From b3b1905fb25732c758b761dcd8a863086c5de373 Mon Sep 17 00:00:00 2001 From: anonpenguin23 Date: Wed, 31 Dec 2025 10:16:26 +0200 Subject: [PATCH] feat: refactor API gateway and CLI utilities for improved functionality - Updated the API gateway documentation to reflect changes in architecture and functionality, emphasizing its role as a multi-functional entry point for decentralized services. - Refactored CLI commands to utilize utility functions for better code organization and maintainability. - Introduced new utility functions for handling peer normalization, service management, and port validation, enhancing the overall CLI experience. - Added a new production installation script to streamline the setup process for users, including detailed dry-run summaries for better visibility. - Enhanced validation mechanisms for configuration files and swarm keys, ensuring robust error handling and user feedback during setup. --- .cursor/rules/network.mdc | 18 +- CHANGELOG.md | 17 + Makefile | 2 +- pkg/cli/prod_commands.go | 775 +---------- pkg/cli/prod_commands_test.go | 4 +- pkg/cli/prod_install.go | 264 ++++ pkg/cli/utils/install.go | 97 ++ pkg/cli/utils/systemd.go | 217 +++ pkg/cli/utils/validation.go | 113 ++ pkg/config/validate.go | 13 + pkg/environments/development/ipfs.go | 287 ++++ pkg/environments/development/ipfs_cluster.go | 314 +++++ pkg/environments/development/process.go | 206 +++ pkg/environments/development/runner.go | 931 +------------ pkg/gateway/{ => auth}/jwt.go | 30 +- pkg/gateway/auth/service.go | 391 ++++++ pkg/gateway/auth_handlers.go | 697 ++-------- pkg/gateway/gateway.go | 47 +- pkg/gateway/jwt_test.go | 36 +- pkg/gateway/middleware.go | 5 +- pkg/gateway/routes.go | 4 +- pkg/installer/installer.go | 14 +- pkg/ipfs/cluster.go | 1171 +--------------- pkg/ipfs/cluster_config.go | 136 ++ pkg/ipfs/cluster_peer.go | 156 +++ pkg/ipfs/cluster_util.go | 119 ++ pkg/node/gateway.go | 204 +++ pkg/node/libp2p.go | 302 +++++ pkg/node/monitoring.go | 12 +- pkg/node/node.go | 1172 +--------------- pkg/node/rqlite.go | 98 ++ pkg/node/utils.go | 127 ++ pkg/rqlite/cluster.go | 301 +++++ pkg/rqlite/cluster_discovery.go | 860 ------------ pkg/rqlite/cluster_discovery_membership.go | 318 +++++ pkg/rqlite/cluster_discovery_queries.go | 251 ++++ pkg/rqlite/cluster_discovery_utils.go | 233 ++++ pkg/rqlite/discovery_manager.go | 61 + pkg/rqlite/process.go | 239 ++++ pkg/rqlite/rqlite.go | 1275 +----------------- pkg/rqlite/util.go | 58 + 41 files changed, 4814 insertions(+), 6761 deletions(-) create mode 100644 pkg/cli/prod_install.go create mode 100644 pkg/cli/utils/install.go create mode 100644 pkg/cli/utils/systemd.go create mode 100644 pkg/cli/utils/validation.go create mode 100644 pkg/environments/development/ipfs.go create mode 100644 pkg/environments/development/ipfs_cluster.go create mode 100644 pkg/environments/development/process.go rename pkg/gateway/{ => auth}/jwt.go (86%) create mode 100644 pkg/gateway/auth/service.go create mode 100644 pkg/ipfs/cluster_config.go create mode 100644 pkg/ipfs/cluster_peer.go create mode 100644 pkg/ipfs/cluster_util.go create mode 100644 pkg/node/gateway.go create mode 100644 pkg/node/libp2p.go create mode 100644 pkg/node/rqlite.go create mode 100644 pkg/node/utils.go create mode 100644 pkg/rqlite/cluster.go create mode 100644 pkg/rqlite/cluster_discovery_membership.go create mode 100644 pkg/rqlite/cluster_discovery_queries.go create mode 100644 pkg/rqlite/cluster_discovery_utils.go create mode 100644 pkg/rqlite/discovery_manager.go create mode 100644 
pkg/rqlite/process.go create mode 100644 pkg/rqlite/util.go diff --git a/.cursor/rules/network.mdc b/.cursor/rules/network.mdc index 06b56ba..7e8075c 100644 --- a/.cursor/rules/network.mdc +++ b/.cursor/rules/network.mdc @@ -83,24 +83,10 @@ When learning a skill, follow this **collaborative, goal-oriented workflow**. Yo # Sonr Gateway (or Sonr Network Gateway) -This project implements a high-performance, multi-protocol API gateway designed to bridge client applications with a decentralized backend infrastructure. It serves as a unified entry point that handles secure user authentication via JWT, provides RESTful access to a distributed key-value cache (Olric), and facilitates decentralized storage interactions with IPFS. Beyond standard HTTP routing and reverse proxying, the gateway supports real-time communication through Pub/Sub mechanisms (WebSockets), mobile engagement via push notifications, and low-level traffic routing using TCP SNI (Server Name Indication) for encrypted service discovery. +This project implements a high-performance, multi-functional API gateway designed to bridge client applications with a decentralized infrastructure. It serves as a unified entry point for diverse services including distributed caching (via Olric), decentralized storage (IPFS), serverless function execution, and real-time pub/sub messaging. The gateway handles critical cross-cutting concerns such as JWT-based authentication, secure anonymous proxying, and mobile push notifications, ensuring that requests are validated, authorized, and efficiently routed across the network's ecosystem. -**Architecture:** Edge Gateway / Middleware Layer (part of a larger Distributed System) +**Architecture:** Edge Gateway / Middleware-heavy Microservice ## Tech Stack - **backend:** Go -## Patterns -- Reverse Proxy -- Middleware Chain -- Adapter Pattern (for storage/cache backends) -- and Observer Pattern (via Pub/Sub). - -## Domain Entities -- `JWT (Authentication Tokens)` -- `Namespaces (Resource Isolation)` -- `Pub/Sub Topics` -- `Distributed Cache (Olric)` -- `Push Notifications` -- `and SNI Routes.` - diff --git a/CHANGELOG.md b/CHANGELOG.md index a742e57..8e03532 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -13,6 +13,23 @@ The format is based on [Keep a Changelog][keepachangelog] and adheres to [Semant ### Deprecated ### Fixed +## [0.81.0] - 2025-12-31 + +### Added +- Implemented a new, robust authentication service within the Gateway for handling wallet-based challenges, signature verification (ETH/SOL), JWT issuance, and API key management. +- Introduced automatic recovery logic for RQLite to detect and recover from split-brain scenarios and ensure cluster stability during restarts. + +### Changed +- Refactored the production installation command (`dbn prod install`) by moving installer logic and utility functions into a dedicated `pkg/cli/utils` package for better modularity and maintainability. +- Reworked the core logic for starting and managing LibP2P, RQLite, and the HTTP Gateway within the Node, including improved peer reconnection and cluster configuration synchronization. + +### Deprecated + +### Removed + +### Fixed +- Corrected IPFS Cluster configuration logic to properly handle port assignments and ensure correct IPFS API addresses are used, resolving potential connection issues between cluster components. 
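(Illustrative sketch, not part of the patch: the "Changed" entry above moves installer helpers into the new pkg/cli/utils package, and the import paths and function names below are taken from the files added later in this diff — utils.NormalizePeers, utils.EnsurePortsAvailable, utils.DefaultPorts, utils.GetProductionServices, utils.IsServiceActive, and config.ValidateSwarmKey. The main wrapper and sample values are hypothetical, and the service helpers assume a Linux host with systemd on PATH.)

    package main

    import (
        "fmt"
        "strings"

        "github.com/DeBrosOfficial/network/pkg/cli/utils"
        "github.com/DeBrosOfficial/network/pkg/config"
    )

    func main() {
        // NormalizePeers trims, validates, and deduplicates --peers input.
        // The check is syntax-only: any well-formed multiaddr passes.
        peers, err := utils.NormalizePeers("/ip4/10.0.0.1/tcp/4001, /ip4/10.0.0.1/tcp/4001")
        fmt.Println(peers, err) // [/ip4/10.0.0.1/tcp/4001] <nil>

        // ValidateSwarmKey accepts exactly 64 hex characters (32 bytes).
        fmt.Println(config.ValidateSwarmKey(strings.Repeat("ab", 32))) // <nil>
        fmt.Println(config.ValidateSwarmKey("not-hex"))                // length/hex error

        // EnsurePortsAvailable probes each well-known port with a short-lived
        // TCP listen before an install or restart proceeds.
        if err := utils.EnsurePortsAvailable("install", utils.DefaultPorts()); err != nil {
            fmt.Println(err)
        }

        // GetProductionServices lists the debros-* units whose unit files exist;
        // IsServiceActive shells out to `systemctl is-active`.
        for _, svc := range utils.GetProductionServices() {
            active, _ := utils.IsServiceActive(svc)
            fmt.Printf("%-22s active=%v\n", svc, active)
        }
    }
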
+ ## [0.80.0] - 2025-12-29 ### Added diff --git a/Makefile b/Makefile index bd2a042..b4b32b5 100644 --- a/Makefile +++ b/Makefile @@ -19,7 +19,7 @@ test-e2e: .PHONY: build clean test run-node run-node2 run-node3 run-example deps tidy fmt vet lint clear-ports install-hooks kill -VERSION := 0.80.0 +VERSION := 0.81.0 COMMIT ?= $(shell git rev-parse --short HEAD 2>/dev/null || echo unknown) DATE ?= $(shell date -u +%Y-%m-%dT%H:%M:%SZ) LDFLAGS := -X 'main.version=$(VERSION)' -X 'main.commit=$(COMMIT)' -X 'main.date=$(DATE)' diff --git a/pkg/cli/prod_commands.go b/pkg/cli/prod_commands.go index c825906..7e6a347 100644 --- a/pkg/cli/prod_commands.go +++ b/pkg/cli/prod_commands.go @@ -2,8 +2,6 @@ package cli import ( "bufio" - "encoding/hex" - "errors" "flag" "fmt" "net" @@ -11,269 +9,12 @@ import ( "os/exec" "path/filepath" "strings" - "syscall" "time" - "github.com/DeBrosOfficial/network/pkg/config" + "github.com/DeBrosOfficial/network/pkg/cli/utils" "github.com/DeBrosOfficial/network/pkg/environments/production" - "github.com/DeBrosOfficial/network/pkg/installer" - "github.com/multiformats/go-multiaddr" ) -// IPFSPeerInfo holds IPFS peer information for configuring Peering.Peers -type IPFSPeerInfo struct { - PeerID string - Addrs []string -} - -// IPFSClusterPeerInfo contains IPFS Cluster peer information for cluster discovery -type IPFSClusterPeerInfo struct { - PeerID string - Addrs []string -} - -// validateSwarmKey validates that a swarm key is 64 hex characters -func validateSwarmKey(key string) error { - key = strings.TrimSpace(key) - if len(key) != 64 { - return fmt.Errorf("swarm key must be 64 hex characters (32 bytes), got %d", len(key)) - } - if _, err := hex.DecodeString(key); err != nil { - return fmt.Errorf("swarm key must be valid hexadecimal: %w", err) - } - return nil -} - -// runInteractiveInstaller launches the TUI installer -func runInteractiveInstaller() { - config, err := installer.Run() - if err != nil { - fmt.Fprintf(os.Stderr, "āŒ %v\n", err) - os.Exit(1) - } - - // Convert TUI config to install args and run installation - var args []string - args = append(args, "--vps-ip", config.VpsIP) - args = append(args, "--domain", config.Domain) - args = append(args, "--branch", config.Branch) - - if config.NoPull { - args = append(args, "--no-pull") - } - - if !config.IsFirstNode { - if config.JoinAddress != "" { - args = append(args, "--join", config.JoinAddress) - } - if config.ClusterSecret != "" { - args = append(args, "--cluster-secret", config.ClusterSecret) - } - if config.SwarmKeyHex != "" { - args = append(args, "--swarm-key", config.SwarmKeyHex) - } - if len(config.Peers) > 0 { - args = append(args, "--peers", strings.Join(config.Peers, ",")) - } - // Pass IPFS peer info for Peering.Peers configuration - if config.IPFSPeerID != "" { - args = append(args, "--ipfs-peer", config.IPFSPeerID) - } - if len(config.IPFSSwarmAddrs) > 0 { - args = append(args, "--ipfs-addrs", strings.Join(config.IPFSSwarmAddrs, ",")) - } - // Pass IPFS Cluster peer info for cluster peer_addresses configuration - if config.IPFSClusterPeerID != "" { - args = append(args, "--ipfs-cluster-peer", config.IPFSClusterPeerID) - } - if len(config.IPFSClusterAddrs) > 0 { - args = append(args, "--ipfs-cluster-addrs", strings.Join(config.IPFSClusterAddrs, ",")) - } - } - - // Re-run with collected args - handleProdInstall(args) -} - -// showDryRunSummary displays what would be done during installation without making changes -func showDryRunSummary(vpsIP, domain, branch string, peers []string, joinAddress 
string, isFirstNode bool, oramaDir string) { - fmt.Print("\n" + strings.Repeat("=", 70) + "\n") - fmt.Printf("DRY RUN - No changes will be made\n") - fmt.Print(strings.Repeat("=", 70) + "\n\n") - - fmt.Printf("šŸ“‹ Installation Summary:\n") - fmt.Printf(" VPS IP: %s\n", vpsIP) - fmt.Printf(" Domain: %s\n", domain) - fmt.Printf(" Branch: %s\n", branch) - if isFirstNode { - fmt.Printf(" Node Type: First node (creates new cluster)\n") - } else { - fmt.Printf(" Node Type: Joining existing cluster\n") - if joinAddress != "" { - fmt.Printf(" Join Address: %s\n", joinAddress) - } - if len(peers) > 0 { - fmt.Printf(" Peers: %d peer(s)\n", len(peers)) - for _, peer := range peers { - fmt.Printf(" - %s\n", peer) - } - } - } - - fmt.Printf("\nšŸ“ Directories that would be created:\n") - fmt.Printf(" %s/configs/\n", oramaDir) - fmt.Printf(" %s/secrets/\n", oramaDir) - fmt.Printf(" %s/data/ipfs/repo/\n", oramaDir) - fmt.Printf(" %s/data/ipfs-cluster/\n", oramaDir) - fmt.Printf(" %s/data/rqlite/\n", oramaDir) - fmt.Printf(" %s/logs/\n", oramaDir) - fmt.Printf(" %s/tls-cache/\n", oramaDir) - - fmt.Printf("\nšŸ”§ Binaries that would be installed:\n") - fmt.Printf(" - Go (if not present)\n") - fmt.Printf(" - RQLite 8.43.0\n") - fmt.Printf(" - IPFS/Kubo 0.38.2\n") - fmt.Printf(" - IPFS Cluster (latest)\n") - fmt.Printf(" - Olric 0.7.0\n") - fmt.Printf(" - anyone-client (npm)\n") - fmt.Printf(" - DeBros binaries (built from %s branch)\n", branch) - - fmt.Printf("\nšŸ” Secrets that would be generated:\n") - fmt.Printf(" - Cluster secret (64-hex)\n") - fmt.Printf(" - IPFS swarm key\n") - fmt.Printf(" - Node identity (Ed25519 keypair)\n") - - fmt.Printf("\nšŸ“ Configuration files that would be created:\n") - fmt.Printf(" - %s/configs/node.yaml\n", oramaDir) - fmt.Printf(" - %s/configs/olric/config.yaml\n", oramaDir) - - fmt.Printf("\nāš™ļø Systemd services that would be created:\n") - fmt.Printf(" - debros-ipfs.service\n") - fmt.Printf(" - debros-ipfs-cluster.service\n") - fmt.Printf(" - debros-olric.service\n") - fmt.Printf(" - debros-node.service (includes embedded gateway + RQLite)\n") - fmt.Printf(" - debros-anyone-client.service\n") - - fmt.Printf("\n🌐 Ports that would be used:\n") - fmt.Printf(" External (must be open in firewall):\n") - fmt.Printf(" - 80 (HTTP for ACME/Let's Encrypt)\n") - fmt.Printf(" - 443 (HTTPS gateway)\n") - fmt.Printf(" - 4101 (IPFS swarm)\n") - fmt.Printf(" - 7001 (RQLite Raft)\n") - fmt.Printf(" Internal (localhost only):\n") - fmt.Printf(" - 4501 (IPFS API)\n") - fmt.Printf(" - 5001 (RQLite HTTP)\n") - fmt.Printf(" - 6001 (Unified gateway)\n") - fmt.Printf(" - 8080 (IPFS gateway)\n") - fmt.Printf(" - 9050 (Anyone SOCKS5)\n") - fmt.Printf(" - 9094 (IPFS Cluster API)\n") - fmt.Printf(" - 3320/3322 (Olric)\n") - - fmt.Print("\n" + strings.Repeat("=", 70) + "\n") - fmt.Printf("To proceed with installation, run without --dry-run\n") - fmt.Print(strings.Repeat("=", 70) + "\n\n") -} - -// validateGeneratedConfig loads and validates the generated node configuration -func validateGeneratedConfig(oramaDir string) error { - configPath := filepath.Join(oramaDir, "configs", "node.yaml") - - // Check if config file exists - if _, err := os.Stat(configPath); os.IsNotExist(err) { - return fmt.Errorf("configuration file not found at %s", configPath) - } - - // Load the config file - file, err := os.Open(configPath) - if err != nil { - return fmt.Errorf("failed to open config file: %w", err) - } - defer file.Close() - - var cfg config.Config - if err := config.DecodeStrict(file, &cfg); err != 
nil { - return fmt.Errorf("failed to parse config: %w", err) - } - - // Validate the configuration - if errs := cfg.Validate(); len(errs) > 0 { - var errMsgs []string - for _, e := range errs { - errMsgs = append(errMsgs, e.Error()) - } - return fmt.Errorf("configuration validation errors:\n - %s", strings.Join(errMsgs, "\n - ")) - } - - return nil -} - -// validateDNSRecord validates that the domain points to the expected IP address -// Returns nil if DNS is valid, warning message if DNS doesn't match but continues, -// or error if DNS lookup fails completely -func validateDNSRecord(domain, expectedIP string) error { - if domain == "" { - return nil // No domain provided, skip validation - } - - ips, err := net.LookupIP(domain) - if err != nil { - // DNS lookup failed - this is a warning, not a fatal error - // The user might be setting up DNS after installation - fmt.Printf(" āš ļø DNS lookup failed for %s: %v\n", domain, err) - fmt.Printf(" Make sure DNS is configured before enabling HTTPS\n") - return nil - } - - // Check if any resolved IP matches the expected IP - for _, ip := range ips { - if ip.String() == expectedIP { - fmt.Printf(" āœ“ DNS validated: %s → %s\n", domain, expectedIP) - return nil - } - } - - // DNS doesn't point to expected IP - warn but continue - resolvedIPs := make([]string, len(ips)) - for i, ip := range ips { - resolvedIPs[i] = ip.String() - } - fmt.Printf(" āš ļø DNS mismatch: %s resolves to %v, expected %s\n", domain, resolvedIPs, expectedIP) - fmt.Printf(" HTTPS certificate generation may fail until DNS is updated\n") - return nil -} - -// normalizePeers normalizes and validates peer multiaddrs -func normalizePeers(peersStr string) ([]string, error) { - if peersStr == "" { - return nil, nil - } - - // Split by comma and trim whitespace - rawPeers := strings.Split(peersStr, ",") - peers := make([]string, 0, len(rawPeers)) - seen := make(map[string]bool) - - for _, peer := range rawPeers { - peer = strings.TrimSpace(peer) - if peer == "" { - continue - } - - // Validate multiaddr format - if _, err := multiaddr.NewMultiaddr(peer); err != nil { - return nil, fmt.Errorf("invalid multiaddr %q: %w", peer, err) - } - - // Deduplicate - if !seen[peer] { - peers = append(peers, peer) - seen[peer] = true - } - } - - return peers, nil -} - // HandleProdCommand handles production environment commands func HandleProdCommand(args []string) { if len(args) == 0 { @@ -368,294 +109,6 @@ func showProdHelp() { fmt.Printf(" orama logs node --follow\n") } -func handleProdInstall(args []string) { - // Parse arguments using flag.FlagSet - fs := flag.NewFlagSet("install", flag.ContinueOnError) - fs.SetOutput(os.Stderr) - - force := fs.Bool("force", false, "Reconfigure all settings") - skipResourceChecks := fs.Bool("ignore-resource-checks", false, "Skip disk/RAM/CPU prerequisite validation") - vpsIP := fs.String("vps-ip", "", "VPS public IP address") - domain := fs.String("domain", "", "Domain for this node (e.g., node-123.orama.network)") - peersStr := fs.String("peers", "", "Comma-separated peer multiaddrs to connect to") - joinAddress := fs.String("join", "", "RQLite join address (IP:port) to join existing cluster") - branch := fs.String("branch", "main", "Git branch to use (main or nightly)") - clusterSecret := fs.String("cluster-secret", "", "Hex-encoded 32-byte cluster secret (for joining existing cluster)") - swarmKey := fs.String("swarm-key", "", "64-hex IPFS swarm key (for joining existing private network)") - ipfsPeerID := fs.String("ipfs-peer", "", "IPFS peer ID to 
connect to (auto-discovered from peer domain)") - ipfsAddrs := fs.String("ipfs-addrs", "", "Comma-separated IPFS swarm addresses (auto-discovered from peer domain)") - ipfsClusterPeerID := fs.String("ipfs-cluster-peer", "", "IPFS Cluster peer ID to connect to (auto-discovered from peer domain)") - ipfsClusterAddrs := fs.String("ipfs-cluster-addrs", "", "Comma-separated IPFS Cluster addresses (auto-discovered from peer domain)") - interactive := fs.Bool("interactive", false, "Run interactive TUI installer") - dryRun := fs.Bool("dry-run", false, "Show what would be done without making changes") - noPull := fs.Bool("no-pull", false, "Skip git clone/pull, use existing /home/debros/src") - - if err := fs.Parse(args); err != nil { - if err == flag.ErrHelp { - return - } - fmt.Fprintf(os.Stderr, "āŒ Failed to parse flags: %v\n", err) - os.Exit(1) - } - - // Launch TUI installer if --interactive flag or no required args provided - if *interactive || (*vpsIP == "" && len(args) == 0) { - runInteractiveInstaller() - return - } - - // Validate branch - if *branch != "main" && *branch != "nightly" { - fmt.Fprintf(os.Stderr, "āŒ Invalid branch: %s (must be 'main' or 'nightly')\n", *branch) - os.Exit(1) - } - - // Normalize and validate peers - peers, err := normalizePeers(*peersStr) - if err != nil { - fmt.Fprintf(os.Stderr, "āŒ Invalid peers: %v\n", err) - fmt.Fprintf(os.Stderr, " Example: --peers /ip4/10.0.0.1/tcp/4001/p2p/Qm...,/ip4/10.0.0.2/tcp/4001/p2p/Qm...\n") - os.Exit(1) - } - - // Validate setup requirements - if os.Geteuid() != 0 { - fmt.Fprintf(os.Stderr, "āŒ Production install must be run as root (use sudo)\n") - os.Exit(1) - } - - // Validate VPS IP is provided - if *vpsIP == "" { - fmt.Fprintf(os.Stderr, "āŒ --vps-ip is required\n") - fmt.Fprintf(os.Stderr, " Usage: sudo orama install --vps-ip \n") - fmt.Fprintf(os.Stderr, " Or run: sudo orama install --interactive\n") - os.Exit(1) - } - - // Determine if this is the first node (creates new cluster) or joining existing cluster - isFirstNode := len(peers) == 0 && *joinAddress == "" - if isFirstNode { - fmt.Printf("ā„¹ļø First node detected - will create new cluster\n") - } else { - fmt.Printf("ā„¹ļø Joining existing cluster\n") - // Cluster secret is required when joining - if *clusterSecret == "" { - fmt.Fprintf(os.Stderr, "āŒ --cluster-secret is required when joining an existing cluster\n") - fmt.Fprintf(os.Stderr, " Provide the 64-hex secret from an existing node (cat ~/.orama/secrets/cluster-secret)\n") - os.Exit(1) - } - if err := production.ValidateClusterSecret(*clusterSecret); err != nil { - fmt.Fprintf(os.Stderr, "āŒ Invalid --cluster-secret: %v\n", err) - os.Exit(1) - } - // Swarm key is required when joining - if *swarmKey == "" { - fmt.Fprintf(os.Stderr, "āŒ --swarm-key is required when joining an existing cluster\n") - fmt.Fprintf(os.Stderr, " Provide the 64-hex swarm key from an existing node:\n") - fmt.Fprintf(os.Stderr, " cat ~/.orama/secrets/swarm.key | tail -1\n") - os.Exit(1) - } - if err := validateSwarmKey(*swarmKey); err != nil { - fmt.Fprintf(os.Stderr, "āŒ Invalid --swarm-key: %v\n", err) - os.Exit(1) - } - } - - oramaHome := "/home/debros" - oramaDir := oramaHome + "/.orama" - - // If cluster secret was provided, save it to secrets directory before setup - if *clusterSecret != "" { - secretsDir := filepath.Join(oramaDir, "secrets") - if err := os.MkdirAll(secretsDir, 0755); err != nil { - fmt.Fprintf(os.Stderr, "āŒ Failed to create secrets directory: %v\n", err) - os.Exit(1) - } - secretPath := 
filepath.Join(secretsDir, "cluster-secret") - if err := os.WriteFile(secretPath, []byte(*clusterSecret), 0600); err != nil { - fmt.Fprintf(os.Stderr, "āŒ Failed to save cluster secret: %v\n", err) - os.Exit(1) - } - fmt.Printf(" āœ“ Cluster secret saved\n") - } - - // If swarm key was provided, save it to secrets directory in full format - if *swarmKey != "" { - secretsDir := filepath.Join(oramaDir, "secrets") - if err := os.MkdirAll(secretsDir, 0755); err != nil { - fmt.Fprintf(os.Stderr, "āŒ Failed to create secrets directory: %v\n", err) - os.Exit(1) - } - // Convert 64-hex key to full swarm.key format - swarmKeyContent := fmt.Sprintf("/key/swarm/psk/1.0.0/\n/base16/\n%s\n", strings.ToUpper(*swarmKey)) - swarmKeyPath := filepath.Join(secretsDir, "swarm.key") - if err := os.WriteFile(swarmKeyPath, []byte(swarmKeyContent), 0600); err != nil { - fmt.Fprintf(os.Stderr, "āŒ Failed to save swarm key: %v\n", err) - os.Exit(1) - } - fmt.Printf(" āœ“ Swarm key saved\n") - } - - // Store IPFS peer info for later use in IPFS configuration - var ipfsPeerInfo *IPFSPeerInfo - if *ipfsPeerID != "" && *ipfsAddrs != "" { - ipfsPeerInfo = &IPFSPeerInfo{ - PeerID: *ipfsPeerID, - Addrs: strings.Split(*ipfsAddrs, ","), - } - } - - // Store IPFS Cluster peer info for cluster peer discovery - var ipfsClusterPeerInfo *IPFSClusterPeerInfo - if *ipfsClusterPeerID != "" { - var addrs []string - if *ipfsClusterAddrs != "" { - addrs = strings.Split(*ipfsClusterAddrs, ",") - } - ipfsClusterPeerInfo = &IPFSClusterPeerInfo{ - PeerID: *ipfsClusterPeerID, - Addrs: addrs, - } - } - - setup := production.NewProductionSetup(oramaHome, os.Stdout, *force, *branch, *noPull, *skipResourceChecks) - - // Inform user if skipping git pull - if *noPull { - fmt.Printf(" āš ļø --no-pull flag enabled: Skipping git clone/pull\n") - fmt.Printf(" Using existing repository at /home/debros/src\n") - } - - // Check port availability before proceeding - if err := ensurePortsAvailable("install", defaultPorts()); err != nil { - fmt.Fprintf(os.Stderr, "āŒ %v\n", err) - os.Exit(1) - } - - // Validate DNS if domain is provided - if *domain != "" { - fmt.Printf("\n🌐 Pre-flight DNS validation...\n") - validateDNSRecord(*domain, *vpsIP) - } - - // Dry-run mode: show what would be done and exit - if *dryRun { - showDryRunSummary(*vpsIP, *domain, *branch, peers, *joinAddress, isFirstNode, oramaDir) - return - } - - // Save branch preference for future upgrades - if err := production.SaveBranchPreference(oramaDir, *branch); err != nil { - fmt.Fprintf(os.Stderr, "āš ļø Warning: Failed to save branch preference: %v\n", err) - } - - // Phase 1: Check prerequisites - fmt.Printf("\nšŸ“‹ Phase 1: Checking prerequisites...\n") - if err := setup.Phase1CheckPrerequisites(); err != nil { - fmt.Fprintf(os.Stderr, "āŒ Prerequisites check failed: %v\n", err) - os.Exit(1) - } - - // Phase 2: Provision environment - fmt.Printf("\nšŸ› ļø Phase 2: Provisioning environment...\n") - if err := setup.Phase2ProvisionEnvironment(); err != nil { - fmt.Fprintf(os.Stderr, "āŒ Environment provisioning failed: %v\n", err) - os.Exit(1) - } - - // Phase 2b: Install binaries - fmt.Printf("\nPhase 2b: Installing binaries...\n") - if err := setup.Phase2bInstallBinaries(); err != nil { - fmt.Fprintf(os.Stderr, "āŒ Binary installation failed: %v\n", err) - os.Exit(1) - } - - // Phase 3: Generate secrets FIRST (before service initialization) - // This ensures cluster secret and swarm key exist before repos are seeded - fmt.Printf("\nšŸ” Phase 3: Generating secrets...\n") - if 
err := setup.Phase3GenerateSecrets(); err != nil { - fmt.Fprintf(os.Stderr, "āŒ Secret generation failed: %v\n", err) - os.Exit(1) - } - - // Phase 4: Generate configs (BEFORE service initialization) - // This ensures node.yaml exists before services try to access it - fmt.Printf("\nāš™ļø Phase 4: Generating configurations...\n") - enableHTTPS := *domain != "" - if err := setup.Phase4GenerateConfigs(peers, *vpsIP, enableHTTPS, *domain, *joinAddress); err != nil { - fmt.Fprintf(os.Stderr, "āŒ Configuration generation failed: %v\n", err) - os.Exit(1) - } - - // Validate generated configuration - fmt.Printf(" Validating generated configuration...\n") - if err := validateGeneratedConfig(oramaDir); err != nil { - fmt.Fprintf(os.Stderr, "āŒ Configuration validation failed: %v\n", err) - os.Exit(1) - } - fmt.Printf(" āœ“ Configuration validated\n") - - // Phase 2c: Initialize services (after config is in place) - fmt.Printf("\nPhase 2c: Initializing services...\n") - var prodIPFSPeer *production.IPFSPeerInfo - if ipfsPeerInfo != nil { - prodIPFSPeer = &production.IPFSPeerInfo{ - PeerID: ipfsPeerInfo.PeerID, - Addrs: ipfsPeerInfo.Addrs, - } - } - var prodIPFSClusterPeer *production.IPFSClusterPeerInfo - if ipfsClusterPeerInfo != nil { - prodIPFSClusterPeer = &production.IPFSClusterPeerInfo{ - PeerID: ipfsClusterPeerInfo.PeerID, - Addrs: ipfsClusterPeerInfo.Addrs, - } - } - if err := setup.Phase2cInitializeServices(peers, *vpsIP, prodIPFSPeer, prodIPFSClusterPeer); err != nil { - fmt.Fprintf(os.Stderr, "āŒ Service initialization failed: %v\n", err) - os.Exit(1) - } - - // Phase 5: Create systemd services - fmt.Printf("\nšŸ”§ Phase 5: Creating systemd services...\n") - if err := setup.Phase5CreateSystemdServices(enableHTTPS); err != nil { - fmt.Fprintf(os.Stderr, "āŒ Service creation failed: %v\n", err) - os.Exit(1) - } - - // Log completion with actual peer ID - setup.LogSetupComplete(setup.NodePeerID) - fmt.Printf("āœ… Production installation complete!\n\n") - - // For first node, print important secrets and identifiers - if isFirstNode { - fmt.Printf("šŸ“‹ Save these for joining future nodes:\n\n") - - // Print cluster secret - clusterSecretPath := filepath.Join(oramaDir, "secrets", "cluster-secret") - if clusterSecretData, err := os.ReadFile(clusterSecretPath); err == nil { - fmt.Printf(" Cluster Secret (--cluster-secret):\n") - fmt.Printf(" %s\n\n", string(clusterSecretData)) - } - - // Print swarm key - swarmKeyPath := filepath.Join(oramaDir, "secrets", "swarm.key") - if swarmKeyData, err := os.ReadFile(swarmKeyPath); err == nil { - swarmKeyContent := strings.TrimSpace(string(swarmKeyData)) - lines := strings.Split(swarmKeyContent, "\n") - if len(lines) >= 3 { - // Extract just the hex part (last line) - fmt.Printf(" IPFS Swarm Key (--swarm-key, last line only):\n") - fmt.Printf(" %s\n\n", lines[len(lines)-1]) - } - } - - // Print peer ID - fmt.Printf(" Node Peer ID:\n") - fmt.Printf(" %s\n\n", setup.NodePeerID) - } -} - func handleProdUpgrade(args []string) { // Parse arguments using flag.FlagSet fs := flag.NewFlagSet("upgrade", flag.ContinueOnError) @@ -767,7 +220,7 @@ func handleProdUpgrade(args []string) { } // Check port availability after stopping services - if err := ensurePortsAvailable("prod upgrade", defaultPorts()); err != nil { + if err := utils.EnsurePortsAvailable("prod upgrade", utils.DefaultPorts()); err != nil { fmt.Fprintf(os.Stderr, "āŒ %v\n", err) os.Exit(1) } @@ -945,7 +398,7 @@ func handleProdUpgrade(args []string) { fmt.Fprintf(os.Stderr, " āš ļø Warning: Failed 
to reload systemd daemon: %v\n", err) } // Restart services to apply changes - use getProductionServices to only restart existing services - services := getProductionServices() + services := utils.GetProductionServices() if len(services) == 0 { fmt.Printf(" āš ļø No services found to restart\n") } else { @@ -991,10 +444,9 @@ func handleProdStatus() { fmt.Printf("Services:\n") found := false for _, svc := range serviceNames { - cmd := exec.Command("systemctl", "is-active", "--quiet", svc) - err := cmd.Run() + active, _ := utils.IsServiceActive(svc) status := "āŒ Inactive" - if err == nil { + if active { status = "āœ… Active" found = true } @@ -1016,52 +468,6 @@ func handleProdStatus() { fmt.Printf("\nView logs with: dbn prod logs \n") } -// resolveServiceName resolves service aliases to actual systemd service names -func resolveServiceName(alias string) ([]string, error) { - // Service alias mapping (unified - no bootstrap/node distinction) - aliases := map[string][]string{ - "node": {"debros-node"}, - "ipfs": {"debros-ipfs"}, - "cluster": {"debros-ipfs-cluster"}, - "ipfs-cluster": {"debros-ipfs-cluster"}, - "gateway": {"debros-gateway"}, - "olric": {"debros-olric"}, - "rqlite": {"debros-node"}, // RQLite logs are in node logs - } - - // Check if it's an alias - if serviceNames, ok := aliases[strings.ToLower(alias)]; ok { - // Filter to only existing services - var existing []string - for _, svc := range serviceNames { - unitPath := filepath.Join("/etc/systemd/system", svc+".service") - if _, err := os.Stat(unitPath); err == nil { - existing = append(existing, svc) - } - } - if len(existing) == 0 { - return nil, fmt.Errorf("no services found for alias %q", alias) - } - return existing, nil - } - - // Check if it's already a full service name - unitPath := filepath.Join("/etc/systemd/system", alias+".service") - if _, err := os.Stat(unitPath); err == nil { - return []string{alias}, nil - } - - // Try without .service suffix - if !strings.HasSuffix(alias, ".service") { - unitPath = filepath.Join("/etc/systemd/system", alias+".service") - if _, err := os.Stat(unitPath); err == nil { - return []string{alias}, nil - } - } - - return nil, fmt.Errorf("service %q not found. Use: node, ipfs, cluster, gateway, olric, or full service name", alias) -} - func handleProdLogs(args []string) { if len(args) == 0 { fmt.Fprintf(os.Stderr, "Usage: dbn prod logs [--follow]\n") @@ -1079,7 +485,7 @@ func handleProdLogs(args []string) { } // Resolve service alias to actual service names - serviceNames, err := resolveServiceName(serviceAlias) + serviceNames, err := utils.ResolveServiceName(serviceAlias) if err != nil { fmt.Fprintf(os.Stderr, "āŒ %v\n", err) fmt.Fprintf(os.Stderr, "\nAvailable service aliases: node, ipfs, cluster, gateway, olric\n") @@ -1138,145 +544,6 @@ func handleProdLogs(args []string) { } } -// errServiceNotFound marks units that systemd does not know about. -var errServiceNotFound = errors.New("service not found") - -type portSpec struct { - Name string - Port int -} - -var servicePorts = map[string][]portSpec{ - "debros-gateway": {{"Gateway API", 6001}}, - "debros-olric": {{"Olric HTTP", 3320}, {"Olric Memberlist", 3322}}, - "debros-node": {{"RQLite HTTP", 5001}, {"RQLite Raft", 7001}}, - "debros-ipfs": {{"IPFS API", 4501}, {"IPFS Gateway", 8080}, {"IPFS Swarm", 4101}}, - "debros-ipfs-cluster": {{"IPFS Cluster API", 9094}}, -} - -// defaultPorts is used for fresh installs/upgrades before unit files exist. 
-func defaultPorts() []portSpec { - return []portSpec{ - {"IPFS Swarm", 4001}, - {"IPFS API", 4501}, - {"IPFS Gateway", 8080}, - {"Gateway API", 6001}, - {"RQLite HTTP", 5001}, - {"RQLite Raft", 7001}, - {"IPFS Cluster API", 9094}, - {"Olric HTTP", 3320}, - {"Olric Memberlist", 3322}, - } -} - -func isServiceActive(service string) (bool, error) { - cmd := exec.Command("systemctl", "is-active", "--quiet", service) - if err := cmd.Run(); err != nil { - if exitErr, ok := err.(*exec.ExitError); ok { - switch exitErr.ExitCode() { - case 3: - return false, nil - case 4: - return false, errServiceNotFound - } - } - return false, err - } - return true, nil -} - -func isServiceEnabled(service string) (bool, error) { - cmd := exec.Command("systemctl", "is-enabled", "--quiet", service) - if err := cmd.Run(); err != nil { - if exitErr, ok := err.(*exec.ExitError); ok { - switch exitErr.ExitCode() { - case 1: - return false, nil // Service is disabled - case 4: - return false, errServiceNotFound - } - } - return false, err - } - return true, nil -} - -func collectPortsForServices(services []string, skipActive bool) ([]portSpec, error) { - seen := make(map[int]portSpec) - for _, svc := range services { - if skipActive { - active, err := isServiceActive(svc) - if err != nil { - return nil, fmt.Errorf("unable to check %s: %w", svc, err) - } - if active { - continue - } - } - for _, spec := range servicePorts[svc] { - if _, ok := seen[spec.Port]; !ok { - seen[spec.Port] = spec - } - } - } - ports := make([]portSpec, 0, len(seen)) - for _, spec := range seen { - ports = append(ports, spec) - } - return ports, nil -} - -func ensurePortsAvailable(action string, ports []portSpec) error { - for _, spec := range ports { - ln, err := net.Listen("tcp", fmt.Sprintf("0.0.0.0:%d", spec.Port)) - if err != nil { - if errors.Is(err, syscall.EADDRINUSE) || strings.Contains(err.Error(), "address already in use") { - return fmt.Errorf("%s cannot continue: %s (port %d) is already in use", action, spec.Name, spec.Port) - } - return fmt.Errorf("%s cannot continue: failed to inspect %s (port %d): %w", action, spec.Name, spec.Port, err) - } - _ = ln.Close() - } - return nil -} - -// getProductionServices returns a list of all DeBros production service names that exist -func getProductionServices() []string { - // Unified service names (no bootstrap/node distinction) - allServices := []string{ - "debros-gateway", - "debros-node", - "debros-olric", - "debros-ipfs-cluster", - "debros-ipfs", - "debros-anyone-client", - } - - // Filter to only existing services by checking if unit file exists - var existing []string - for _, svc := range allServices { - unitPath := filepath.Join("/etc/systemd/system", svc+".service") - if _, err := os.Stat(unitPath); err == nil { - existing = append(existing, svc) - } - } - - return existing -} - -func isServiceMasked(service string) (bool, error) { - cmd := exec.Command("systemctl", "is-enabled", service) - output, err := cmd.CombinedOutput() - if err != nil { - outputStr := string(output) - if strings.Contains(outputStr, "masked") { - return true, nil - } - return false, err - } - return false, nil -} - func handleProdStart() { if os.Geteuid() != 0 { fmt.Fprintf(os.Stderr, "āŒ Production commands must be run as root (use sudo)\n") @@ -1285,7 +552,7 @@ func handleProdStart() { fmt.Printf("Starting all DeBros production services...\n") - services := getProductionServices() + services := utils.GetProductionServices() if len(services) == 0 { fmt.Printf(" āš ļø No DeBros services found\n") return @@ 
-1301,7 +568,7 @@ func handleProdStart() { inactive := make([]string, 0, len(services)) for _, svc := range services { // Check if service is masked and unmask it - masked, err := isServiceMasked(svc) + masked, err := utils.IsServiceMasked(svc) if err == nil && masked { fmt.Printf(" āš ļø %s is masked, unmasking...\n", svc) if err := exec.Command("systemctl", "unmask", svc).Run(); err != nil { @@ -1311,7 +578,7 @@ func handleProdStart() { } } - active, err := isServiceActive(svc) + active, err := utils.IsServiceActive(svc) if err != nil { fmt.Printf(" āš ļø Unable to check %s: %v\n", svc, err) continue @@ -1319,7 +586,7 @@ func handleProdStart() { if active { fmt.Printf(" ā„¹ļø %s already running\n", svc) // Re-enable if disabled (in case it was stopped with 'dbn prod stop') - enabled, err := isServiceEnabled(svc) + enabled, err := utils.IsServiceEnabled(svc) if err == nil && !enabled { if err := exec.Command("systemctl", "enable", svc).Run(); err != nil { fmt.Printf(" āš ļø Failed to re-enable %s: %v\n", svc, err) @@ -1338,12 +605,12 @@ func handleProdStart() { } // Check port availability for services we're about to start - ports, err := collectPortsForServices(inactive, false) + ports, err := utils.CollectPortsForServices(inactive, false) if err != nil { fmt.Fprintf(os.Stderr, "āŒ %v\n", err) os.Exit(1) } - if err := ensurePortsAvailable("prod start", ports); err != nil { + if err := utils.EnsurePortsAvailable("prod start", ports); err != nil { fmt.Fprintf(os.Stderr, "āŒ %v\n", err) os.Exit(1) } @@ -1351,7 +618,7 @@ func handleProdStart() { // Enable and start inactive services for _, svc := range inactive { // Re-enable the service first (in case it was disabled by 'dbn prod stop') - enabled, err := isServiceEnabled(svc) + enabled, err := utils.IsServiceEnabled(svc) if err == nil && !enabled { if err := exec.Command("systemctl", "enable", svc).Run(); err != nil { fmt.Printf(" āš ļø Failed to enable %s: %v\n", svc, err) @@ -1385,7 +652,7 @@ func handleProdStop() { fmt.Printf("Stopping all DeBros production services...\n") - services := getProductionServices() + services := utils.GetProductionServices() if len(services) == 0 { fmt.Printf(" āš ļø No DeBros services found\n") return @@ -1424,7 +691,7 @@ func handleProdStop() { hadError := false for _, svc := range services { - active, err := isServiceActive(svc) + active, err := utils.IsServiceActive(svc) if err != nil { fmt.Printf(" āš ļø Unable to check %s: %v\n", svc, err) hadError = true @@ -1441,7 +708,7 @@ func handleProdStop() { } else { // Wait and verify again time.Sleep(1 * time.Second) - if stillActive, _ := isServiceActive(svc); stillActive { + if stillActive, _ := utils.IsServiceActive(svc); stillActive { fmt.Printf(" āŒ %s restarted itself (Restart=always)\n", svc) hadError = true } else { @@ -1451,7 +718,7 @@ func handleProdStop() { } // Disable the service to prevent it from auto-starting on boot - enabled, err := isServiceEnabled(svc) + enabled, err := utils.IsServiceEnabled(svc) if err != nil { fmt.Printf(" āš ļø Unable to check if %s is enabled: %v\n", svc, err) // Continue anyway - try to disable @@ -1486,7 +753,7 @@ func handleProdRestart() { fmt.Printf("Restarting all DeBros production services...\n") - services := getProductionServices() + services := utils.GetProductionServices() if len(services) == 0 { fmt.Printf(" āš ļø No DeBros services found\n") return @@ -1495,7 +762,7 @@ func handleProdRestart() { // Stop all active services first fmt.Printf(" Stopping services...\n") for _, svc := range services 
{ - active, err := isServiceActive(svc) + active, err := utils.IsServiceActive(svc) if err != nil { fmt.Printf(" āš ļø Unable to check %s: %v\n", svc, err) continue @@ -1512,12 +779,12 @@ func handleProdRestart() { } // Check port availability before restarting - ports, err := collectPortsForServices(services, false) + ports, err := utils.CollectPortsForServices(services, false) if err != nil { fmt.Fprintf(os.Stderr, "āŒ %v\n", err) os.Exit(1) } - if err := ensurePortsAvailable("prod restart", ports); err != nil { + if err := utils.EnsurePortsAvailable("prod restart", ports); err != nil { fmt.Fprintf(os.Stderr, "āŒ %v\n", err) os.Exit(1) } diff --git a/pkg/cli/prod_commands_test.go b/pkg/cli/prod_commands_test.go index 926d589..c67e617 100644 --- a/pkg/cli/prod_commands_test.go +++ b/pkg/cli/prod_commands_test.go @@ -2,6 +2,8 @@ package cli import ( "testing" + + "github.com/DeBrosOfficial/network/pkg/cli/utils" ) // TestProdCommandFlagParsing verifies that prod command flags are parsed correctly @@ -156,7 +158,7 @@ func TestNormalizePeers(t *testing.T) { for _, tt := range tests { t.Run(tt.name, func(t *testing.T) { - peers, err := normalizePeers(tt.input) + peers, err := utils.NormalizePeers(tt.input) if tt.expectError && err == nil { t.Errorf("expected error but got none") diff --git a/pkg/cli/prod_install.go b/pkg/cli/prod_install.go new file mode 100644 index 0000000..9f53907 --- /dev/null +++ b/pkg/cli/prod_install.go @@ -0,0 +1,264 @@ +package cli + +import ( + "flag" + "fmt" + "os" + "path/filepath" + "strings" + + "github.com/DeBrosOfficial/network/pkg/cli/utils" + "github.com/DeBrosOfficial/network/pkg/environments/production" +) + +func handleProdInstall(args []string) { + // Parse arguments using flag.FlagSet + fs := flag.NewFlagSet("install", flag.ContinueOnError) + fs.SetOutput(os.Stderr) + + vpsIP := fs.String("vps-ip", "", "Public IP of this VPS (required)") + domain := fs.String("domain", "", "Domain name for HTTPS (optional, e.g. gateway.example.com)") + branch := fs.String("branch", "main", "Git branch to use (main or nightly)") + noPull := fs.Bool("no-pull", false, "Skip git clone/pull, use existing repository in /home/debros/src") + force := fs.Bool("force", false, "Force reconfiguration even if already installed") + dryRun := fs.Bool("dry-run", false, "Show what would be done without making changes") + skipResourceChecks := fs.Bool("skip-checks", false, "Skip minimum resource checks (RAM/CPU)") + + // Cluster join flags + joinAddress := fs.String("join", "", "Join an existing cluster (e.g. 
1.2.3.4:7001)") + clusterSecret := fs.String("cluster-secret", "", "Cluster secret for IPFS Cluster (required if joining)") + swarmKey := fs.String("swarm-key", "", "IPFS Swarm key (required if joining)") + peersStr := fs.String("peers", "", "Comma-separated list of bootstrap peer multiaddrs") + + // IPFS/Cluster specific info for Peering configuration + ipfsPeerID := fs.String("ipfs-peer", "", "Peer ID of existing IPFS node to peer with") + ipfsAddrs := fs.String("ipfs-addrs", "", "Comma-separated multiaddrs of existing IPFS node") + ipfsClusterPeerID := fs.String("ipfs-cluster-peer", "", "Peer ID of existing IPFS Cluster node") + ipfsClusterAddrs := fs.String("ipfs-cluster-addrs", "", "Comma-separated multiaddrs of existing IPFS Cluster node") + + if err := fs.Parse(args); err != nil { + if err == flag.ErrHelp { + return + } + fmt.Fprintf(os.Stderr, "āŒ Failed to parse flags: %v\n", err) + os.Exit(1) + } + + // Validate required flags + if *vpsIP == "" && !*dryRun { + fmt.Fprintf(os.Stderr, "āŒ Error: --vps-ip is required for installation\n") + fmt.Fprintf(os.Stderr, " Example: dbn prod install --vps-ip 1.2.3.4\n") + os.Exit(1) + } + + if os.Geteuid() != 0 && !*dryRun { + fmt.Fprintf(os.Stderr, "āŒ Production installation must be run as root (use sudo)\n") + os.Exit(1) + } + + oramaHome := "/home/debros" + oramaDir := oramaHome + "/.orama" + fmt.Printf("šŸš€ Starting production installation...\n\n") + + isFirstNode := *joinAddress == "" + peers, err := utils.NormalizePeers(*peersStr) + if err != nil { + fmt.Fprintf(os.Stderr, "āŒ Invalid peers: %v\n", err) + os.Exit(1) + } + + // If cluster secret was provided, save it to secrets directory before setup + if *clusterSecret != "" { + secretsDir := filepath.Join(oramaDir, "secrets") + if err := os.MkdirAll(secretsDir, 0755); err != nil { + fmt.Fprintf(os.Stderr, "āŒ Failed to create secrets directory: %v\n", err) + os.Exit(1) + } + secretPath := filepath.Join(secretsDir, "cluster-secret") + if err := os.WriteFile(secretPath, []byte(*clusterSecret), 0600); err != nil { + fmt.Fprintf(os.Stderr, "āŒ Failed to save cluster secret: %v\n", err) + os.Exit(1) + } + fmt.Printf(" āœ“ Cluster secret saved\n") + } + + // If swarm key was provided, save it to secrets directory in full format + if *swarmKey != "" { + secretsDir := filepath.Join(oramaDir, "secrets") + if err := os.MkdirAll(secretsDir, 0755); err != nil { + fmt.Fprintf(os.Stderr, "āŒ Failed to create secrets directory: %v\n", err) + os.Exit(1) + } + // Convert 64-hex key to full swarm.key format + swarmKeyContent := fmt.Sprintf("/key/swarm/psk/1.0.0/\n/base16/\n%s\n", strings.ToUpper(*swarmKey)) + swarmKeyPath := filepath.Join(secretsDir, "swarm.key") + if err := os.WriteFile(swarmKeyPath, []byte(swarmKeyContent), 0600); err != nil { + fmt.Fprintf(os.Stderr, "āŒ Failed to save swarm key: %v\n", err) + os.Exit(1) + } + fmt.Printf(" āœ“ Swarm key saved\n") + } + + // Store IPFS peer info for peering + var ipfsPeerInfo *utils.IPFSPeerInfo + if *ipfsPeerID != "" { + var addrs []string + if *ipfsAddrs != "" { + addrs = strings.Split(*ipfsAddrs, ",") + } + ipfsPeerInfo = &utils.IPFSPeerInfo{ + PeerID: *ipfsPeerID, + Addrs: addrs, + } + } + + // Store IPFS Cluster peer info for cluster peer discovery + var ipfsClusterPeerInfo *utils.IPFSClusterPeerInfo + if *ipfsClusterPeerID != "" { + var addrs []string + if *ipfsClusterAddrs != "" { + addrs = strings.Split(*ipfsClusterAddrs, ",") + } + ipfsClusterPeerInfo = &utils.IPFSClusterPeerInfo{ + PeerID: *ipfsClusterPeerID, + Addrs: addrs, + } + 
} + + setup := production.NewProductionSetup(oramaHome, os.Stdout, *force, *branch, *noPull, *skipResourceChecks) + + // Inform user if skipping git pull + if *noPull { + fmt.Printf(" āš ļø --no-pull flag enabled: Skipping git clone/pull\n") + fmt.Printf(" Using existing repository at /home/debros/src\n") + } + + // Check port availability before proceeding + if err := utils.EnsurePortsAvailable("install", utils.DefaultPorts()); err != nil { + fmt.Fprintf(os.Stderr, "āŒ %v\n", err) + os.Exit(1) + } + + // Validate DNS if domain is provided + if *domain != "" { + fmt.Printf("\n🌐 Pre-flight DNS validation...\n") + utils.ValidateDNSRecord(*domain, *vpsIP) + } + + // Dry-run mode: show what would be done and exit + if *dryRun { + utils.ShowDryRunSummary(*vpsIP, *domain, *branch, peers, *joinAddress, isFirstNode, oramaDir) + return + } + + // Save branch preference for future upgrades + if err := production.SaveBranchPreference(oramaDir, *branch); err != nil { + fmt.Fprintf(os.Stderr, "āš ļø Warning: Failed to save branch preference: %v\n", err) + } + + // Phase 1: Check prerequisites + fmt.Printf("\nšŸ“‹ Phase 1: Checking prerequisites...\n") + if err := setup.Phase1CheckPrerequisites(); err != nil { + fmt.Fprintf(os.Stderr, "āŒ Prerequisites check failed: %v\n", err) + os.Exit(1) + } + + // Phase 2: Provision environment + fmt.Printf("\nšŸ› ļø Phase 2: Provisioning environment...\n") + if err := setup.Phase2ProvisionEnvironment(); err != nil { + fmt.Fprintf(os.Stderr, "āŒ Environment provisioning failed: %v\n", err) + os.Exit(1) + } + + // Phase 2b: Install binaries + fmt.Printf("\nPhase 2b: Installing binaries...\n") + if err := setup.Phase2bInstallBinaries(); err != nil { + fmt.Fprintf(os.Stderr, "āŒ Binary installation failed: %v\n", err) + os.Exit(1) + } + + // Phase 3: Generate secrets FIRST (before service initialization) + // This ensures cluster secret and swarm key exist before repos are seeded + fmt.Printf("\nšŸ” Phase 3: Generating secrets...\n") + if err := setup.Phase3GenerateSecrets(); err != nil { + fmt.Fprintf(os.Stderr, "āŒ Secret generation failed: %v\n", err) + os.Exit(1) + } + + // Phase 4: Generate configs (BEFORE service initialization) + // This ensures node.yaml exists before services try to access it + fmt.Printf("\nāš™ļø Phase 4: Generating configurations...\n") + enableHTTPS := *domain != "" + if err := setup.Phase4GenerateConfigs(peers, *vpsIP, enableHTTPS, *domain, *joinAddress); err != nil { + fmt.Fprintf(os.Stderr, "āŒ Configuration generation failed: %v\n", err) + os.Exit(1) + } + + // Validate generated configuration + fmt.Printf(" Validating generated configuration...\n") + if err := utils.ValidateGeneratedConfig(oramaDir); err != nil { + fmt.Fprintf(os.Stderr, "āŒ Configuration validation failed: %v\n", err) + os.Exit(1) + } + fmt.Printf(" āœ“ Configuration validated\n") + + // Phase 2c: Initialize services (after config is in place) + fmt.Printf("\nPhase 2c: Initializing services...\n") + var prodIPFSPeer *production.IPFSPeerInfo + if ipfsPeerInfo != nil { + prodIPFSPeer = &production.IPFSPeerInfo{ + PeerID: ipfsPeerInfo.PeerID, + Addrs: ipfsPeerInfo.Addrs, + } + } + var prodIPFSClusterPeer *production.IPFSClusterPeerInfo + if ipfsClusterPeerInfo != nil { + prodIPFSClusterPeer = &production.IPFSClusterPeerInfo{ + PeerID: ipfsClusterPeerInfo.PeerID, + Addrs: ipfsClusterPeerInfo.Addrs, + } + } + if err := setup.Phase2cInitializeServices(peers, *vpsIP, prodIPFSPeer, prodIPFSClusterPeer); err != nil { + fmt.Fprintf(os.Stderr, "āŒ Service 
initialization failed: %v\n", err) + os.Exit(1) + } + + // Phase 5: Create systemd services + fmt.Printf("\nšŸ”§ Phase 5: Creating systemd services...\n") + if err := setup.Phase5CreateSystemdServices(enableHTTPS); err != nil { + fmt.Fprintf(os.Stderr, "āŒ Service creation failed: %v\n", err) + os.Exit(1) + } + + // Log completion with actual peer ID + setup.LogSetupComplete(setup.NodePeerID) + fmt.Printf("āœ… Production installation complete!\n\n") + + // For first node, print important secrets and identifiers + if isFirstNode { + fmt.Printf("šŸ“‹ Save these for joining future nodes:\n\n") + + // Print cluster secret + clusterSecretPath := filepath.Join(oramaDir, "secrets", "cluster-secret") + if clusterSecretData, err := os.ReadFile(clusterSecretPath); err == nil { + fmt.Printf(" Cluster Secret (--cluster-secret):\n") + fmt.Printf(" %s\n\n", string(clusterSecretData)) + } + + // Print swarm key + swarmKeyPath := filepath.Join(oramaDir, "secrets", "swarm.key") + if swarmKeyData, err := os.ReadFile(swarmKeyPath); err == nil { + swarmKeyContent := strings.TrimSpace(string(swarmKeyData)) + lines := strings.Split(swarmKeyContent, "\n") + if len(lines) >= 3 { + // Extract just the hex part (last line) + fmt.Printf(" IPFS Swarm Key (--swarm-key, last line only):\n") + fmt.Printf(" %s\n\n", lines[len(lines)-1]) + } + } + + // Print peer ID + fmt.Printf(" Node Peer ID:\n") + fmt.Printf(" %s\n\n", setup.NodePeerID) + } +} diff --git a/pkg/cli/utils/install.go b/pkg/cli/utils/install.go new file mode 100644 index 0000000..21ff11c --- /dev/null +++ b/pkg/cli/utils/install.go @@ -0,0 +1,97 @@ +package utils + +import ( + "fmt" + "strings" +) + +// IPFSPeerInfo holds IPFS peer information for configuring Peering.Peers +type IPFSPeerInfo struct { + PeerID string + Addrs []string +} + +// IPFSClusterPeerInfo contains IPFS Cluster peer information for cluster discovery +type IPFSClusterPeerInfo struct { + PeerID string + Addrs []string +} + +// ShowDryRunSummary displays what would be done during installation without making changes +func ShowDryRunSummary(vpsIP, domain, branch string, peers []string, joinAddress string, isFirstNode bool, oramaDir string) { + fmt.Print("\n" + strings.Repeat("=", 70) + "\n") + fmt.Printf("DRY RUN - No changes will be made\n") + fmt.Print(strings.Repeat("=", 70) + "\n\n") + + fmt.Printf("šŸ“‹ Installation Summary:\n") + fmt.Printf(" VPS IP: %s\n", vpsIP) + fmt.Printf(" Domain: %s\n", domain) + fmt.Printf(" Branch: %s\n", branch) + if isFirstNode { + fmt.Printf(" Node Type: First node (creates new cluster)\n") + } else { + fmt.Printf(" Node Type: Joining existing cluster\n") + if joinAddress != "" { + fmt.Printf(" Join Address: %s\n", joinAddress) + } + if len(peers) > 0 { + fmt.Printf(" Peers: %d peer(s)\n", len(peers)) + for _, peer := range peers { + fmt.Printf(" - %s\n", peer) + } + } + } + + fmt.Printf("\nšŸ“ Directories that would be created:\n") + fmt.Printf(" %s/configs/\n", oramaDir) + fmt.Printf(" %s/secrets/\n", oramaDir) + fmt.Printf(" %s/data/ipfs/repo/\n", oramaDir) + fmt.Printf(" %s/data/ipfs-cluster/\n", oramaDir) + fmt.Printf(" %s/data/rqlite/\n", oramaDir) + fmt.Printf(" %s/logs/\n", oramaDir) + fmt.Printf(" %s/tls-cache/\n", oramaDir) + + fmt.Printf("\nšŸ”§ Binaries that would be installed:\n") + fmt.Printf(" - Go (if not present)\n") + fmt.Printf(" - RQLite 8.43.0\n") + fmt.Printf(" - IPFS/Kubo 0.38.2\n") + fmt.Printf(" - IPFS Cluster (latest)\n") + fmt.Printf(" - Olric 0.7.0\n") + fmt.Printf(" - anyone-client (npm)\n") + fmt.Printf(" - DeBros binaries 
(built from %s branch)\n", branch) + + fmt.Printf("\nšŸ” Secrets that would be generated:\n") + fmt.Printf(" - Cluster secret (64-hex)\n") + fmt.Printf(" - IPFS swarm key\n") + fmt.Printf(" - Node identity (Ed25519 keypair)\n") + + fmt.Printf("\nšŸ“ Configuration files that would be created:\n") + fmt.Printf(" - %s/configs/node.yaml\n", oramaDir) + fmt.Printf(" - %s/configs/olric/config.yaml\n", oramaDir) + + fmt.Printf("\nāš™ļø Systemd services that would be created:\n") + fmt.Printf(" - debros-ipfs.service\n") + fmt.Printf(" - debros-ipfs-cluster.service\n") + fmt.Printf(" - debros-olric.service\n") + fmt.Printf(" - debros-node.service (includes embedded gateway + RQLite)\n") + fmt.Printf(" - debros-anyone-client.service\n") + + fmt.Printf("\n🌐 Ports that would be used:\n") + fmt.Printf(" External (must be open in firewall):\n") + fmt.Printf(" - 80 (HTTP for ACME/Let's Encrypt)\n") + fmt.Printf(" - 443 (HTTPS gateway)\n") + fmt.Printf(" - 4101 (IPFS swarm)\n") + fmt.Printf(" - 7001 (RQLite Raft)\n") + fmt.Printf(" Internal (localhost only):\n") + fmt.Printf(" - 4501 (IPFS API)\n") + fmt.Printf(" - 5001 (RQLite HTTP)\n") + fmt.Printf(" - 6001 (Unified gateway)\n") + fmt.Printf(" - 8080 (IPFS gateway)\n") + fmt.Printf(" - 9050 (Anyone SOCKS5)\n") + fmt.Printf(" - 9094 (IPFS Cluster API)\n") + fmt.Printf(" - 3320/3322 (Olric)\n") + + fmt.Print("\n" + strings.Repeat("=", 70) + "\n") + fmt.Printf("To proceed with installation, run without --dry-run\n") + fmt.Print(strings.Repeat("=", 70) + "\n\n") +} diff --git a/pkg/cli/utils/systemd.go b/pkg/cli/utils/systemd.go new file mode 100644 index 0000000..e73c40e --- /dev/null +++ b/pkg/cli/utils/systemd.go @@ -0,0 +1,217 @@ +package utils + +import ( + "errors" + "fmt" + "net" + "os" + "os/exec" + "path/filepath" + "strings" + "syscall" +) + +var ErrServiceNotFound = errors.New("service not found") + +// PortSpec defines a port and its name for checking availability +type PortSpec struct { + Name string + Port int +} + +var ServicePorts = map[string][]PortSpec{ + "debros-gateway": { + {Name: "Gateway API", Port: 6001}, + }, + "debros-olric": { + {Name: "Olric HTTP", Port: 3320}, + {Name: "Olric Memberlist", Port: 3322}, + }, + "debros-node": { + {Name: "RQLite HTTP", Port: 5001}, + {Name: "RQLite Raft", Port: 7001}, + }, + "debros-ipfs": { + {Name: "IPFS API", Port: 4501}, + {Name: "IPFS Gateway", Port: 8080}, + {Name: "IPFS Swarm", Port: 4101}, + }, + "debros-ipfs-cluster": { + {Name: "IPFS Cluster API", Port: 9094}, + }, +} + +// DefaultPorts is used for fresh installs/upgrades before unit files exist. 
+func DefaultPorts() []PortSpec { + return []PortSpec{ + {Name: "IPFS Swarm", Port: 4001}, + {Name: "IPFS API", Port: 4501}, + {Name: "IPFS Gateway", Port: 8080}, + {Name: "Gateway API", Port: 6001}, + {Name: "RQLite HTTP", Port: 5001}, + {Name: "RQLite Raft", Port: 7001}, + {Name: "IPFS Cluster API", Port: 9094}, + {Name: "Olric HTTP", Port: 3320}, + {Name: "Olric Memberlist", Port: 3322}, + } +} + +// ResolveServiceName resolves service aliases to actual systemd service names +func ResolveServiceName(alias string) ([]string, error) { + // Service alias mapping (unified - no bootstrap/node distinction) + aliases := map[string][]string{ + "node": {"debros-node"}, + "ipfs": {"debros-ipfs"}, + "cluster": {"debros-ipfs-cluster"}, + "ipfs-cluster": {"debros-ipfs-cluster"}, + "gateway": {"debros-gateway"}, + "olric": {"debros-olric"}, + "rqlite": {"debros-node"}, // RQLite logs are in node logs + } + + // Check if it's an alias + if serviceNames, ok := aliases[strings.ToLower(alias)]; ok { + // Filter to only existing services + var existing []string + for _, svc := range serviceNames { + unitPath := filepath.Join("/etc/systemd/system", svc+".service") + if _, err := os.Stat(unitPath); err == nil { + existing = append(existing, svc) + } + } + if len(existing) == 0 { + return nil, fmt.Errorf("no services found for alias %q", alias) + } + return existing, nil + } + + // Check if it's already a full service name + unitPath := filepath.Join("/etc/systemd/system", alias+".service") + if _, err := os.Stat(unitPath); err == nil { + return []string{alias}, nil + } + + // Try without .service suffix + if !strings.HasSuffix(alias, ".service") { + unitPath = filepath.Join("/etc/systemd/system", alias+".service") + if _, err := os.Stat(unitPath); err == nil { + return []string{alias}, nil + } + } + + return nil, fmt.Errorf("service %q not found. 
Use: node, ipfs, cluster, gateway, olric, or full service name", alias) +} + +// IsServiceActive checks if a systemd service is currently active (running) +func IsServiceActive(service string) (bool, error) { + cmd := exec.Command("systemctl", "is-active", "--quiet", service) + if err := cmd.Run(); err != nil { + if exitErr, ok := err.(*exec.ExitError); ok { + switch exitErr.ExitCode() { + case 3: + return false, nil + case 4: + return false, ErrServiceNotFound + } + } + return false, err + } + return true, nil +} + +// IsServiceEnabled checks if a systemd service is enabled to start on boot +func IsServiceEnabled(service string) (bool, error) { + cmd := exec.Command("systemctl", "is-enabled", "--quiet", service) + if err := cmd.Run(); err != nil { + if exitErr, ok := err.(*exec.ExitError); ok { + switch exitErr.ExitCode() { + case 1: + return false, nil // Service is disabled + case 4: + return false, ErrServiceNotFound + } + } + return false, err + } + return true, nil +} + +// IsServiceMasked checks if a systemd service is masked +func IsServiceMasked(service string) (bool, error) { + cmd := exec.Command("systemctl", "is-enabled", service) + output, err := cmd.CombinedOutput() + if err != nil { + outputStr := string(output) + if strings.Contains(outputStr, "masked") { + return true, nil + } + return false, err + } + return false, nil +} + +// GetProductionServices returns a list of all DeBros production service names that exist +func GetProductionServices() []string { + // Unified service names (no bootstrap/node distinction) + allServices := []string{ + "debros-gateway", + "debros-node", + "debros-olric", + "debros-ipfs-cluster", + "debros-ipfs", + "debros-anyone-client", + } + + // Filter to only existing services by checking if unit file exists + var existing []string + for _, svc := range allServices { + unitPath := filepath.Join("/etc/systemd/system", svc+".service") + if _, err := os.Stat(unitPath); err == nil { + existing = append(existing, svc) + } + } + + return existing +} + +// CollectPortsForServices returns a list of ports used by the specified services +func CollectPortsForServices(services []string, skipActive bool) ([]PortSpec, error) { + seen := make(map[int]PortSpec) + for _, svc := range services { + if skipActive { + active, err := IsServiceActive(svc) + if err != nil { + return nil, fmt.Errorf("unable to check %s: %w", svc, err) + } + if active { + continue + } + } + for _, spec := range ServicePorts[svc] { + if _, ok := seen[spec.Port]; !ok { + seen[spec.Port] = spec + } + } + } + ports := make([]PortSpec, 0, len(seen)) + for _, spec := range seen { + ports = append(ports, spec) + } + return ports, nil +} + +// EnsurePortsAvailable checks if the specified ports are available +func EnsurePortsAvailable(action string, ports []PortSpec) error { + for _, spec := range ports { + ln, err := net.Listen("tcp", fmt.Sprintf("0.0.0.0:%d", spec.Port)) + if err != nil { + if errors.Is(err, syscall.EADDRINUSE) || strings.Contains(err.Error(), "address already in use") { + return fmt.Errorf("%s cannot continue: %s (port %d) is already in use", action, spec.Name, spec.Port) + } + return fmt.Errorf("%s cannot continue: failed to inspect %s (port %d): %w", action, spec.Name, spec.Port, err) + } + _ = ln.Close() + } + return nil +} + diff --git a/pkg/cli/utils/validation.go b/pkg/cli/utils/validation.go new file mode 100644 index 0000000..ce42a4f --- /dev/null +++ b/pkg/cli/utils/validation.go @@ -0,0 +1,113 @@ +package utils + +import ( + "fmt" + "net" + "os" + "path/filepath" + 
"strings" + + "github.com/DeBrosOfficial/network/pkg/config" + "github.com/multiformats/go-multiaddr" +) + +// ValidateGeneratedConfig loads and validates the generated node configuration +func ValidateGeneratedConfig(oramaDir string) error { + configPath := filepath.Join(oramaDir, "configs", "node.yaml") + + // Check if config file exists + if _, err := os.Stat(configPath); os.IsNotExist(err) { + return fmt.Errorf("configuration file not found at %s", configPath) + } + + // Load the config file + file, err := os.Open(configPath) + if err != nil { + return fmt.Errorf("failed to open config file: %w", err) + } + defer file.Close() + + var cfg config.Config + if err := config.DecodeStrict(file, &cfg); err != nil { + return fmt.Errorf("failed to parse config: %w", err) + } + + // Validate the configuration + if errs := cfg.Validate(); len(errs) > 0 { + var errMsgs []string + for _, e := range errs { + errMsgs = append(errMsgs, e.Error()) + } + return fmt.Errorf("configuration validation errors:\n - %s", strings.Join(errMsgs, "\n - ")) + } + + return nil +} + +// ValidateDNSRecord validates that the domain points to the expected IP address +// Returns nil if DNS is valid, warning message if DNS doesn't match but continues, +// or error if DNS lookup fails completely +func ValidateDNSRecord(domain, expectedIP string) error { + if domain == "" { + return nil // No domain provided, skip validation + } + + ips, err := net.LookupIP(domain) + if err != nil { + // DNS lookup failed - this is a warning, not a fatal error + // The user might be setting up DNS after installation + fmt.Printf(" āš ļø DNS lookup failed for %s: %v\n", domain, err) + fmt.Printf(" Make sure DNS is configured before enabling HTTPS\n") + return nil + } + + // Check if any resolved IP matches the expected IP + for _, ip := range ips { + if ip.String() == expectedIP { + fmt.Printf(" āœ“ DNS validated: %s → %s\n", domain, expectedIP) + return nil + } + } + + // DNS doesn't point to expected IP - warn but continue + resolvedIPs := make([]string, len(ips)) + for i, ip := range ips { + resolvedIPs[i] = ip.String() + } + fmt.Printf(" āš ļø DNS mismatch: %s resolves to %v, expected %s\n", domain, resolvedIPs, expectedIP) + fmt.Printf(" HTTPS certificate generation may fail until DNS is updated\n") + return nil +} + +// NormalizePeers normalizes and validates peer multiaddrs +func NormalizePeers(peersStr string) ([]string, error) { + if peersStr == "" { + return nil, nil + } + + // Split by comma and trim whitespace + rawPeers := strings.Split(peersStr, ",") + peers := make([]string, 0, len(rawPeers)) + seen := make(map[string]bool) + + for _, peer := range rawPeers { + peer = strings.TrimSpace(peer) + if peer == "" { + continue + } + + // Validate multiaddr format + if _, err := multiaddr.NewMultiaddr(peer); err != nil { + return nil, fmt.Errorf("invalid multiaddr %q: %w", peer, err) + } + + // Deduplicate + if !seen[peer] { + peers = append(peers, peer) + seen[peer] = true + } + } + + return peers, nil +} + diff --git a/pkg/config/validate.go b/pkg/config/validate.go index d07e67d..21d9249 100644 --- a/pkg/config/validate.go +++ b/pkg/config/validate.go @@ -1,6 +1,7 @@ package config import ( + "encoding/hex" "fmt" "net" "os" @@ -585,3 +586,15 @@ func extractTCPPort(multiaddrStr string) string { } return "" } + +// ValidateSwarmKey validates that a swarm key is 64 hex characters +func ValidateSwarmKey(key string) error { + key = strings.TrimSpace(key) + if len(key) != 64 { + return fmt.Errorf("swarm key must be 64 hex characters 
(32 bytes), got %d", len(key)) + } + if _, err := hex.DecodeString(key); err != nil { + return fmt.Errorf("swarm key must be valid hexadecimal: %w", err) + } + return nil +} diff --git a/pkg/environments/development/ipfs.go b/pkg/environments/development/ipfs.go new file mode 100644 index 0000000..a6ba3d9 --- /dev/null +++ b/pkg/environments/development/ipfs.go @@ -0,0 +1,287 @@ +package development + +import ( + "context" + "encoding/json" + "fmt" + "net/http" + "net/url" + "os" + "os/exec" + "path/filepath" + "strings" + "time" + + "github.com/DeBrosOfficial/network/pkg/tlsutil" +) + +// ipfsNodeInfo holds information about an IPFS node for peer discovery +type ipfsNodeInfo struct { + name string + ipfsPath string + apiPort int + swarmPort int + gatewayPort int + peerID string +} + +func (pm *ProcessManager) buildIPFSNodes(topology *Topology) []ipfsNodeInfo { + var nodes []ipfsNodeInfo + for _, nodeSpec := range topology.Nodes { + nodes = append(nodes, ipfsNodeInfo{ + name: nodeSpec.Name, + ipfsPath: filepath.Join(pm.oramaDir, nodeSpec.DataDir, "ipfs/repo"), + apiPort: nodeSpec.IPFSAPIPort, + swarmPort: nodeSpec.IPFSSwarmPort, + gatewayPort: nodeSpec.IPFSGatewayPort, + peerID: "", + }) + } + return nodes +} + +func (pm *ProcessManager) startIPFS(ctx context.Context) error { + topology := DefaultTopology() + nodes := pm.buildIPFSNodes(topology) + + for i := range nodes { + os.MkdirAll(nodes[i].ipfsPath, 0755) + + if _, err := os.Stat(filepath.Join(nodes[i].ipfsPath, "config")); os.IsNotExist(err) { + fmt.Fprintf(pm.logWriter, " Initializing IPFS (%s)...\n", nodes[i].name) + cmd := exec.CommandContext(ctx, "ipfs", "init", "--profile=server", "--repo-dir="+nodes[i].ipfsPath) + if _, err := cmd.CombinedOutput(); err != nil { + fmt.Fprintf(pm.logWriter, " Warning: ipfs init failed: %v\n", err) + } + + swarmKeyPath := filepath.Join(pm.oramaDir, "swarm.key") + if data, err := os.ReadFile(swarmKeyPath); err == nil { + os.WriteFile(filepath.Join(nodes[i].ipfsPath, "swarm.key"), data, 0600) + } + } + + peerID, err := configureIPFSRepo(nodes[i].ipfsPath, nodes[i].apiPort, nodes[i].gatewayPort, nodes[i].swarmPort) + if err != nil { + fmt.Fprintf(pm.logWriter, " Warning: failed to configure IPFS repo for %s: %v\n", nodes[i].name, err) + } else { + nodes[i].peerID = peerID + fmt.Fprintf(pm.logWriter, " Peer ID for %s: %s\n", nodes[i].name, peerID) + } + } + + for i := range nodes { + pidPath := filepath.Join(pm.pidsDir, fmt.Sprintf("ipfs-%s.pid", nodes[i].name)) + logPath := filepath.Join(pm.oramaDir, "logs", fmt.Sprintf("ipfs-%s.log", nodes[i].name)) + + cmd := exec.CommandContext(ctx, "ipfs", "daemon", "--enable-pubsub-experiment", "--repo-dir="+nodes[i].ipfsPath) + logFile, _ := os.Create(logPath) + cmd.Stdout = logFile + cmd.Stderr = logFile + + if err := cmd.Start(); err != nil { + return fmt.Errorf("failed to start ipfs-%s: %w", nodes[i].name, err) + } + + os.WriteFile(pidPath, []byte(fmt.Sprintf("%d", cmd.Process.Pid)), 0644) + pm.processes[fmt.Sprintf("ipfs-%s", nodes[i].name)] = &ManagedProcess{ + Name: fmt.Sprintf("ipfs-%s", nodes[i].name), + PID: cmd.Process.Pid, + StartTime: time.Now(), + LogPath: logPath, + } + + fmt.Fprintf(pm.logWriter, "āœ“ IPFS (%s) started (PID: %d, API: %d, Swarm: %d)\n", nodes[i].name, cmd.Process.Pid, nodes[i].apiPort, nodes[i].swarmPort) + } + + time.Sleep(2 * time.Second) + + if err := pm.seedIPFSPeersWithHTTP(ctx, nodes); err != nil { + fmt.Fprintf(pm.logWriter, "āš ļø Failed to seed IPFS peers: %v\n", err) + } + + return nil +} + +func 
configureIPFSRepo(repoPath string, apiPort, gatewayPort, swarmPort int) (string, error) { + configPath := filepath.Join(repoPath, "config") + data, err := os.ReadFile(configPath) + if err != nil { + return "", fmt.Errorf("failed to read IPFS config: %w", err) + } + + var config map[string]interface{} + if err := json.Unmarshal(data, &config); err != nil { + return "", fmt.Errorf("failed to parse IPFS config: %w", err) + } + + config["Addresses"] = map[string]interface{}{ + "API": []string{fmt.Sprintf("/ip4/127.0.0.1/tcp/%d", apiPort)}, + "Gateway": []string{fmt.Sprintf("/ip4/127.0.0.1/tcp/%d", gatewayPort)}, + "Swarm": []string{ + fmt.Sprintf("/ip4/0.0.0.0/tcp/%d", swarmPort), + fmt.Sprintf("/ip6/::/tcp/%d", swarmPort), + }, + } + + config["AutoConf"] = map[string]interface{}{ + "Enabled": false, + } + config["Bootstrap"] = []string{} + + if dns, ok := config["DNS"].(map[string]interface{}); ok { + dns["Resolvers"] = map[string]interface{}{} + } else { + config["DNS"] = map[string]interface{}{ + "Resolvers": map[string]interface{}{}, + } + } + + if routing, ok := config["Routing"].(map[string]interface{}); ok { + routing["DelegatedRouters"] = []string{} + } else { + config["Routing"] = map[string]interface{}{ + "DelegatedRouters": []string{}, + } + } + + if ipns, ok := config["Ipns"].(map[string]interface{}); ok { + ipns["DelegatedPublishers"] = []string{} + } else { + config["Ipns"] = map[string]interface{}{ + "DelegatedPublishers": []string{}, + } + } + + if api, ok := config["API"].(map[string]interface{}); ok { + api["HTTPHeaders"] = map[string][]string{ + "Access-Control-Allow-Origin": {"*"}, + "Access-Control-Allow-Methods": {"GET", "PUT", "POST", "DELETE", "OPTIONS"}, + "Access-Control-Allow-Headers": {"Content-Type", "X-Requested-With"}, + "Access-Control-Expose-Headers": {"Content-Length", "Content-Range"}, + } + } else { + config["API"] = map[string]interface{}{ + "HTTPHeaders": map[string][]string{ + "Access-Control-Allow-Origin": {"*"}, + "Access-Control-Allow-Methods": {"GET", "PUT", "POST", "DELETE", "OPTIONS"}, + "Access-Control-Allow-Headers": {"Content-Type", "X-Requested-With"}, + "Access-Control-Expose-Headers": {"Content-Length", "Content-Range"}, + }, + } + } + + updatedData, err := json.MarshalIndent(config, "", " ") + if err != nil { + return "", fmt.Errorf("failed to marshal IPFS config: %w", err) + } + + if err := os.WriteFile(configPath, updatedData, 0644); err != nil { + return "", fmt.Errorf("failed to write IPFS config: %w", err) + } + + if id, ok := config["Identity"].(map[string]interface{}); ok { + if peerID, ok := id["PeerID"].(string); ok { + return peerID, nil + } + } + + return "", fmt.Errorf("could not extract peer ID from config") +} + +func (pm *ProcessManager) seedIPFSPeersWithHTTP(ctx context.Context, nodes []ipfsNodeInfo) error { + fmt.Fprintf(pm.logWriter, " Seeding IPFS local bootstrap peers via HTTP API...\n") + + for _, node := range nodes { + if err := pm.waitIPFSReady(ctx, node); err != nil { + fmt.Fprintf(pm.logWriter, " Warning: failed to wait for IPFS readiness for %s: %v\n", node.name, err) + } + } + + for i, node := range nodes { + httpURL := fmt.Sprintf("http://127.0.0.1:%d/api/v0/bootstrap/rm?all=true", node.apiPort) + if err := pm.ipfsHTTPCall(ctx, httpURL, "POST"); err != nil { + fmt.Fprintf(pm.logWriter, " Warning: failed to clear bootstrap for %s: %v\n", node.name, err) + } + + for j, otherNode := range nodes { + if i == j { + continue + } + + multiaddr := fmt.Sprintf("/ip4/127.0.0.1/tcp/%d/p2p/%s", otherNode.swarmPort, 
otherNode.peerID) + httpURL := fmt.Sprintf("http://127.0.0.1:%d/api/v0/bootstrap/add?arg=%s", node.apiPort, url.QueryEscape(multiaddr)) + if err := pm.ipfsHTTPCall(ctx, httpURL, "POST"); err != nil { + fmt.Fprintf(pm.logWriter, " Warning: failed to add bootstrap peer for %s: %v\n", node.name, err) + } + } + } + + return nil +} + +func (pm *ProcessManager) waitIPFSReady(ctx context.Context, node ipfsNodeInfo) error { + maxRetries := 30 + retryInterval := 500 * time.Millisecond + + for attempt := 0; attempt < maxRetries; attempt++ { + httpURL := fmt.Sprintf("http://127.0.0.1:%d/api/v0/version", node.apiPort) + if err := pm.ipfsHTTPCall(ctx, httpURL, "POST"); err == nil { + return nil + } + + select { + case <-time.After(retryInterval): + continue + case <-ctx.Done(): + return ctx.Err() + } + } + + return fmt.Errorf("IPFS daemon %s did not become ready", node.name) +} + +func (pm *ProcessManager) ipfsHTTPCall(ctx context.Context, urlStr string, method string) error { + client := tlsutil.NewHTTPClient(5 * time.Second) + req, err := http.NewRequestWithContext(ctx, method, urlStr, nil) + if err != nil { + return fmt.Errorf("failed to create request: %w", err) + } + + resp, err := client.Do(req) + if err != nil { + return fmt.Errorf("HTTP call failed: %w", err) + } + defer resp.Body.Close() + + if resp.StatusCode >= 400 { + return fmt.Errorf("HTTP %d", resp.StatusCode) + } + + return nil +} + +func readIPFSConfigValue(ctx context.Context, repoPath string, key string) (string, error) { + configPath := filepath.Join(repoPath, "config") + data, err := os.ReadFile(configPath) + if err != nil { + return "", fmt.Errorf("failed to read IPFS config: %w", err) + } + + lines := strings.Split(string(data), "\n") + for _, line := range lines { + line = strings.TrimSpace(line) + if strings.Contains(line, key) { + parts := strings.SplitN(line, ":", 2) + if len(parts) == 2 { + value := strings.TrimSpace(parts[1]) + value = strings.Trim(value, `",`) + if value != "" { + return value, nil + } + } + } + } + + return "", fmt.Errorf("key %s not found in IPFS config", key) +} + diff --git a/pkg/environments/development/ipfs_cluster.go b/pkg/environments/development/ipfs_cluster.go new file mode 100644 index 0000000..b968348 --- /dev/null +++ b/pkg/environments/development/ipfs_cluster.go @@ -0,0 +1,314 @@ +package development + +import ( + "context" + "encoding/json" + "fmt" + "net/http" + "os" + "os/exec" + "path/filepath" + "strings" + "time" +) + +func (pm *ProcessManager) startIPFSCluster(ctx context.Context) error { + topology := DefaultTopology() + var nodes []struct { + name string + clusterPath string + restAPIPort int + clusterPort int + ipfsPort int + } + + for _, nodeSpec := range topology.Nodes { + nodes = append(nodes, struct { + name string + clusterPath string + restAPIPort int + clusterPort int + ipfsPort int + }{ + nodeSpec.Name, + filepath.Join(pm.oramaDir, nodeSpec.DataDir, "ipfs-cluster"), + nodeSpec.ClusterAPIPort, + nodeSpec.ClusterPort, + nodeSpec.IPFSAPIPort, + }) + } + + fmt.Fprintf(pm.logWriter, " Waiting for IPFS daemons to be ready...\n") + ipfsNodes := pm.buildIPFSNodes(topology) + for _, ipfsNode := range ipfsNodes { + if err := pm.waitIPFSReady(ctx, ipfsNode); err != nil { + fmt.Fprintf(pm.logWriter, " Warning: IPFS %s did not become ready: %v\n", ipfsNode.name, err) + } + } + + secretPath := filepath.Join(pm.oramaDir, "cluster-secret") + clusterSecret, err := os.ReadFile(secretPath) + if err != nil { + return fmt.Errorf("failed to read cluster secret: %w", err) + } + clusterSecretHex 
:= strings.TrimSpace(string(clusterSecret)) + + bootstrapMultiaddr := "" + { + node := nodes[0] + if err := pm.cleanClusterState(node.clusterPath); err != nil { + fmt.Fprintf(pm.logWriter, " Warning: failed to clean cluster state for %s: %v\n", node.name, err) + } + + os.MkdirAll(node.clusterPath, 0755) + fmt.Fprintf(pm.logWriter, " Initializing IPFS Cluster (%s)...\n", node.name) + cmd := exec.CommandContext(ctx, "ipfs-cluster-service", "init", "--force") + cmd.Env = append(os.Environ(), + fmt.Sprintf("IPFS_CLUSTER_PATH=%s", node.clusterPath), + fmt.Sprintf("CLUSTER_SECRET=%s", clusterSecretHex), + ) + if output, err := cmd.CombinedOutput(); err != nil { + fmt.Fprintf(pm.logWriter, " Warning: ipfs-cluster-service init failed: %v (output: %s)\n", err, string(output)) + } + + if err := pm.ensureIPFSClusterPorts(node.clusterPath, node.restAPIPort, node.clusterPort); err != nil { + fmt.Fprintf(pm.logWriter, " Warning: failed to update IPFS Cluster config for %s: %v\n", node.name, err) + } + + pidPath := filepath.Join(pm.pidsDir, fmt.Sprintf("ipfs-cluster-%s.pid", node.name)) + logPath := filepath.Join(pm.oramaDir, "logs", fmt.Sprintf("ipfs-cluster-%s.log", node.name)) + + cmd = exec.CommandContext(ctx, "ipfs-cluster-service", "daemon") + cmd.Env = append(os.Environ(), fmt.Sprintf("IPFS_CLUSTER_PATH=%s", node.clusterPath)) + logFile, _ := os.Create(logPath) + cmd.Stdout = logFile + cmd.Stderr = logFile + + if err := cmd.Start(); err != nil { + return err + } + + os.WriteFile(pidPath, []byte(fmt.Sprintf("%d", cmd.Process.Pid)), 0644) + fmt.Fprintf(pm.logWriter, "āœ“ IPFS Cluster (%s) started (PID: %d, API: %d)\n", node.name, cmd.Process.Pid, node.restAPIPort) + + if err := pm.waitClusterReady(ctx, node.name, node.restAPIPort); err != nil { + fmt.Fprintf(pm.logWriter, " Warning: IPFS Cluster %s did not become ready: %v\n", node.name, err) + } + + time.Sleep(500 * time.Millisecond) + + peerID, err := pm.waitForClusterPeerID(ctx, filepath.Join(node.clusterPath, "identity.json")) + if err != nil { + fmt.Fprintf(pm.logWriter, " Warning: failed to read bootstrap peer ID: %v\n", err) + } else { + bootstrapMultiaddr = fmt.Sprintf("/ip4/127.0.0.1/tcp/%d/p2p/%s", node.clusterPort, peerID) + } + } + + for i := 1; i < len(nodes); i++ { + node := nodes[i] + if err := pm.cleanClusterState(node.clusterPath); err != nil { + fmt.Fprintf(pm.logWriter, " Warning: failed to clean cluster state for %s: %v\n", node.name, err) + } + + os.MkdirAll(node.clusterPath, 0755) + fmt.Fprintf(pm.logWriter, " Initializing IPFS Cluster (%s)...\n", node.name) + cmd := exec.CommandContext(ctx, "ipfs-cluster-service", "init", "--force") + cmd.Env = append(os.Environ(), + fmt.Sprintf("IPFS_CLUSTER_PATH=%s", node.clusterPath), + fmt.Sprintf("CLUSTER_SECRET=%s", clusterSecretHex), + ) + if output, err := cmd.CombinedOutput(); err != nil { + fmt.Fprintf(pm.logWriter, " Warning: ipfs-cluster-service init failed for %s: %v (output: %s)\n", node.name, err, string(output)) + } + + if err := pm.ensureIPFSClusterPorts(node.clusterPath, node.restAPIPort, node.clusterPort); err != nil { + fmt.Fprintf(pm.logWriter, " Warning: failed to update IPFS Cluster config for %s: %v\n", node.name, err) + } + + pidPath := filepath.Join(pm.pidsDir, fmt.Sprintf("ipfs-cluster-%s.pid", node.name)) + logPath := filepath.Join(pm.oramaDir, "logs", fmt.Sprintf("ipfs-cluster-%s.log", node.name)) + + args := []string{"daemon"} + if bootstrapMultiaddr != "" { + args = append(args, "--bootstrap", bootstrapMultiaddr) + } + + cmd = exec.CommandContext(ctx, 
"ipfs-cluster-service", args...) + cmd.Env = append(os.Environ(), fmt.Sprintf("IPFS_CLUSTER_PATH=%s", node.clusterPath)) + logFile, _ := os.Create(logPath) + cmd.Stdout = logFile + cmd.Stderr = logFile + + if err := cmd.Start(); err != nil { + continue + } + + os.WriteFile(pidPath, []byte(fmt.Sprintf("%d", cmd.Process.Pid)), 0644) + fmt.Fprintf(pm.logWriter, "āœ“ IPFS Cluster (%s) started (PID: %d, API: %d)\n", node.name, cmd.Process.Pid, node.restAPIPort) + + if err := pm.waitClusterReady(ctx, node.name, node.restAPIPort); err != nil { + fmt.Fprintf(pm.logWriter, " Warning: IPFS Cluster %s did not become ready: %v\n", node.name, err) + } + } + + fmt.Fprintf(pm.logWriter, " Waiting for IPFS Cluster peers to form...\n") + if err := pm.waitClusterFormed(ctx, nodes[0].restAPIPort); err != nil { + fmt.Fprintf(pm.logWriter, " Warning: IPFS Cluster did not form fully: %v\n", err) + } + + time.Sleep(1 * time.Second) + return nil +} + +func (pm *ProcessManager) waitForClusterPeerID(ctx context.Context, identityPath string) (string, error) { + maxRetries := 30 + retryInterval := 500 * time.Millisecond + + for attempt := 0; attempt < maxRetries; attempt++ { + data, err := os.ReadFile(identityPath) + if err == nil { + var identity map[string]interface{} + if err := json.Unmarshal(data, &identity); err == nil { + if id, ok := identity["id"].(string); ok { + return id, nil + } + } + } + + select { + case <-time.After(retryInterval): + continue + case <-ctx.Done(): + return "", ctx.Err() + } + } + + return "", fmt.Errorf("could not read cluster peer ID") +} + +func (pm *ProcessManager) waitClusterReady(ctx context.Context, name string, restAPIPort int) error { + maxRetries := 30 + retryInterval := 500 * time.Millisecond + + for attempt := 0; attempt < maxRetries; attempt++ { + httpURL := fmt.Sprintf("http://127.0.0.1:%d/peers", restAPIPort) + resp, err := http.Get(httpURL) + if err == nil && resp.StatusCode == 200 { + resp.Body.Close() + return nil + } + if resp != nil { + resp.Body.Close() + } + + select { + case <-time.After(retryInterval): + continue + case <-ctx.Done(): + return ctx.Err() + } + } + + return fmt.Errorf("IPFS Cluster %s did not become ready", name) +} + +func (pm *ProcessManager) waitClusterFormed(ctx context.Context, bootstrapRestAPIPort int) error { + maxRetries := 30 + retryInterval := 1 * time.Second + requiredPeers := 3 + + for attempt := 0; attempt < maxRetries; attempt++ { + httpURL := fmt.Sprintf("http://127.0.0.1:%d/peers", bootstrapRestAPIPort) + resp, err := http.Get(httpURL) + if err == nil && resp.StatusCode == 200 { + dec := json.NewDecoder(resp.Body) + peerCount := 0 + for { + var peer interface{} + if err := dec.Decode(&peer); err != nil { + break + } + peerCount++ + } + resp.Body.Close() + if peerCount >= requiredPeers { + return nil + } + } + if resp != nil { + resp.Body.Close() + } + + select { + case <-time.After(retryInterval): + continue + case <-ctx.Done(): + return ctx.Err() + } + } + + return fmt.Errorf("IPFS Cluster did not form fully") +} + +func (pm *ProcessManager) cleanClusterState(clusterPath string) error { + pebblePath := filepath.Join(clusterPath, "pebble") + os.RemoveAll(pebblePath) + + peerstorePath := filepath.Join(clusterPath, "peerstore") + os.Remove(peerstorePath) + + serviceJSONPath := filepath.Join(clusterPath, "service.json") + os.Remove(serviceJSONPath) + + lockPath := filepath.Join(clusterPath, "cluster.lock") + os.Remove(lockPath) + + return nil +} + +func (pm *ProcessManager) ensureIPFSClusterPorts(clusterPath string, restAPIPort int, 
clusterPort int) error { + serviceJSONPath := filepath.Join(clusterPath, "service.json") + data, err := os.ReadFile(serviceJSONPath) + if err != nil { + return err + } + + var config map[string]interface{} + json.Unmarshal(data, &config) + + portOffset := restAPIPort - 9094 + proxyPort := 9095 + portOffset + pinsvcPort := 9097 + portOffset + ipfsPort := 4501 + (portOffset / 10) + + if api, ok := config["api"].(map[string]interface{}); ok { + if restapi, ok := api["restapi"].(map[string]interface{}); ok { + restapi["http_listen_multiaddress"] = fmt.Sprintf("/ip4/0.0.0.0/tcp/%d", restAPIPort) + } + if proxy, ok := api["ipfsproxy"].(map[string]interface{}); ok { + proxy["listen_multiaddress"] = fmt.Sprintf("/ip4/127.0.0.1/tcp/%d", proxyPort) + proxy["node_multiaddress"] = fmt.Sprintf("/ip4/127.0.0.1/tcp/%d", ipfsPort) + } + if pinsvc, ok := api["pinsvcapi"].(map[string]interface{}); ok { + pinsvc["http_listen_multiaddress"] = fmt.Sprintf("/ip4/127.0.0.1/tcp/%d", pinsvcPort) + } + } + + if cluster, ok := config["cluster"].(map[string]interface{}); ok { + cluster["listen_multiaddress"] = []string{ + fmt.Sprintf("/ip4/0.0.0.0/tcp/%d", clusterPort), + fmt.Sprintf("/ip4/127.0.0.1/tcp/%d", clusterPort), + } + } + + if connector, ok := config["ipfs_connector"].(map[string]interface{}); ok { + if ipfshttp, ok := connector["ipfshttp"].(map[string]interface{}); ok { + ipfshttp["node_multiaddress"] = fmt.Sprintf("/ip4/127.0.0.1/tcp/%d", ipfsPort) + } + } + + updatedData, _ := json.MarshalIndent(config, "", " ") + return os.WriteFile(serviceJSONPath, updatedData, 0644) +} + diff --git a/pkg/environments/development/process.go b/pkg/environments/development/process.go new file mode 100644 index 0000000..55d6ee1 --- /dev/null +++ b/pkg/environments/development/process.go @@ -0,0 +1,206 @@ +package development + +import ( + "context" + "fmt" + "os" + "os/exec" + "path/filepath" + "runtime" + "strconv" + "strings" + "time" +) + +func (pm *ProcessManager) printStartupSummary(topology *Topology) { + fmt.Fprintf(pm.logWriter, "\nāœ… Development environment ready!\n") + fmt.Fprintf(pm.logWriter, "═══════════════════════════════════════\n\n") + + fmt.Fprintf(pm.logWriter, "šŸ“” Access your nodes via unified gateway ports:\n\n") + for _, node := range topology.Nodes { + fmt.Fprintf(pm.logWriter, " %s:\n", node.Name) + fmt.Fprintf(pm.logWriter, " curl http://localhost:%d/health\n", node.UnifiedGatewayPort) + fmt.Fprintf(pm.logWriter, " curl http://localhost:%d/rqlite/http/db/execute\n", node.UnifiedGatewayPort) + fmt.Fprintf(pm.logWriter, " curl http://localhost:%d/cluster/health\n\n", node.UnifiedGatewayPort) + } + + fmt.Fprintf(pm.logWriter, "🌐 Main Gateway:\n") + fmt.Fprintf(pm.logWriter, " curl http://localhost:%d/v1/status\n\n", topology.GatewayPort) + + fmt.Fprintf(pm.logWriter, "šŸ“Š Other Services:\n") + fmt.Fprintf(pm.logWriter, " Olric: http://localhost:%d\n", topology.OlricHTTPPort) + fmt.Fprintf(pm.logWriter, " Anon SOCKS: 127.0.0.1:%d\n\n", topology.AnonSOCKSPort) + + fmt.Fprintf(pm.logWriter, "šŸ“ Useful Commands:\n") + fmt.Fprintf(pm.logWriter, " ./bin/orama dev status - Check service status\n") + fmt.Fprintf(pm.logWriter, " ./bin/orama dev logs node-1 - View logs\n") + fmt.Fprintf(pm.logWriter, " ./bin/orama dev down - Stop all services\n\n") + + fmt.Fprintf(pm.logWriter, "šŸ“‚ Logs: %s/logs\n", pm.oramaDir) + fmt.Fprintf(pm.logWriter, "āš™ļø Config: %s\n\n", pm.oramaDir) +} + +func (pm *ProcessManager) stopProcess(name string) error { + pidPath := filepath.Join(pm.pidsDir, fmt.Sprintf("%s.pid", 
name))
+	pidBytes, err := os.ReadFile(pidPath)
+	if err != nil {
+		return nil
+	}
+
+	pid, err := strconv.Atoi(strings.TrimSpace(string(pidBytes)))
+	if err != nil {
+		os.Remove(pidPath)
+		return nil
+	}
+
+	if !checkProcessRunning(pid) {
+		os.Remove(pidPath)
+		fmt.Fprintf(pm.logWriter, "āœ“ %s (not running)\n", name)
+		return nil
+	}
+
+	proc, err := os.FindProcess(pid)
+	if err != nil {
+		os.Remove(pidPath)
+		return nil
+	}
+
+	proc.Signal(os.Interrupt)
+
+	gracefulShutdown := false
+	for i := 0; i < 20; i++ {
+		time.Sleep(100 * time.Millisecond)
+		if !checkProcessRunning(pid) {
+			gracefulShutdown = true
+			break
+		}
+	}
+
+	if !gracefulShutdown && checkProcessRunning(pid) {
+		proc.Signal(os.Kill)
+		time.Sleep(200 * time.Millisecond)
+
+		if runtime.GOOS != "windows" {
+			exec.Command("pkill", "-9", "-P", fmt.Sprintf("%d", pid)).Run()
+		}
+
+		if checkProcessRunning(pid) {
+			exec.Command("kill", "-9", fmt.Sprintf("%d", pid)).Run()
+			time.Sleep(100 * time.Millisecond)
+		}
+	}
+
+	os.Remove(pidPath)
+
+	if gracefulShutdown {
+		fmt.Fprintf(pm.logWriter, "āœ“ %s stopped gracefully\n", name)
+	} else {
+		fmt.Fprintf(pm.logWriter, "āœ“ %s stopped (forced)\n", name)
+	}
+	return nil
+}
+
+// checkProcessRunning reports whether a process with the given PID is alive.
+// proc.Signal(os.Signal(nil)) always returns an error on POSIX systems (the nil
+// interface fails the syscall.Signal assertion), so every check would report
+// "not running"; probe with "kill -0" instead, which tests existence without
+// delivering a signal. Like the rest of this file, this assumes a POSIX userland.
+func checkProcessRunning(pid int) bool {
+	if pid <= 0 {
+		return false
+	}
+	return exec.Command("kill", "-0", strconv.Itoa(pid)).Run() == nil
+}
+
+func (pm *ProcessManager) startNode(name, configFile, logPath string) error {
+	pidPath := filepath.Join(pm.pidsDir, fmt.Sprintf("%s.pid", name))
+	cmd := exec.Command("./bin/orama-node", "--config", configFile)
+	logFile, _ := os.Create(logPath)
+	cmd.Stdout = logFile
+	cmd.Stderr = logFile
+
+	if err := cmd.Start(); err != nil {
+		return fmt.Errorf("failed to start %s: %w", name, err)
+	}
+
+	os.WriteFile(pidPath, []byte(fmt.Sprintf("%d", cmd.Process.Pid)), 0644)
+	fmt.Fprintf(pm.logWriter, "āœ“ %s started (PID: %d)\n", strings.Title(name), cmd.Process.Pid)
+
+	time.Sleep(1 * time.Second)
+	return nil
+}
+
+func (pm *ProcessManager) startGateway(ctx context.Context) error {
+	pidPath := filepath.Join(pm.pidsDir, "gateway.pid")
+	logPath := filepath.Join(pm.oramaDir, "logs", "gateway.log")
+
+	cmd := exec.Command("./bin/gateway", "--config", "gateway.yaml")
+	logFile, _ := os.Create(logPath)
+	cmd.Stdout = logFile
+	cmd.Stderr = logFile
+
+	if err := cmd.Start(); err != nil {
+		return fmt.Errorf("failed to start gateway: %w", err)
+	}
+
+	os.WriteFile(pidPath, []byte(fmt.Sprintf("%d", cmd.Process.Pid)), 0644)
+	fmt.Fprintf(pm.logWriter, "āœ“ Gateway started (PID: %d, listen: 6001)\n", cmd.Process.Pid)
+
+	return nil
+}
+
+func (pm *ProcessManager) startOlric(ctx context.Context) error {
+	pidPath := filepath.Join(pm.pidsDir, "olric.pid")
+	logPath := filepath.Join(pm.oramaDir, "logs", "olric.log")
+	configPath := filepath.Join(pm.oramaDir, "olric-config.yaml")
+
+	cmd := exec.CommandContext(ctx, "olric-server")
+	cmd.Env = append(os.Environ(), fmt.Sprintf("OLRIC_SERVER_CONFIG=%s", configPath))
+	logFile, _ := os.Create(logPath)
+	cmd.Stdout = logFile
+	cmd.Stderr = logFile
+
+	if err := cmd.Start(); err != nil {
+		return fmt.Errorf("failed to start olric: %w", err)
+	}
+
+	os.WriteFile(pidPath, []byte(fmt.Sprintf("%d", cmd.Process.Pid)), 0644)
+	fmt.Fprintf(pm.logWriter, "āœ“ Olric started (PID: %d)\n", cmd.Process.Pid)
+
+	time.Sleep(1 * time.Second)
+	return nil
+}
+
+func (pm *ProcessManager) startAnon(ctx context.Context) error {
+	if runtime.GOOS != "darwin" {
+		return nil // Skip on non-macOS for now
+	}
+
+	pidPath := filepath.Join(pm.pidsDir, "anon.pid")
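+	// NOTE (editorial assumption): anyone-client is expected to expose its SOCKS
+	// listener on 127.0.0.1:9050, its default; the hard-coded "SOCKS: 9050" in the
+	// startup log below relies on that default and should stay in sync with
+	// topology.AnonSOCKSPort.
+	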
logPath := filepath.Join(pm.oramaDir, "logs", "anon.log") + + cmd := exec.CommandContext(ctx, "npx", "anyone-client") + logFile, _ := os.Create(logPath) + cmd.Stdout = logFile + cmd.Stderr = logFile + + if err := cmd.Start(); err != nil { + fmt.Fprintf(pm.logWriter, " āš ļø Failed to start Anon: %v\n", err) + return nil + } + + os.WriteFile(pidPath, []byte(fmt.Sprintf("%d", cmd.Process.Pid)), 0644) + fmt.Fprintf(pm.logWriter, "āœ“ Anon proxy started (PID: %d, SOCKS: 9050)\n", cmd.Process.Pid) + + return nil +} + +func (pm *ProcessManager) startNodes(ctx context.Context) error { + topology := DefaultTopology() + for _, nodeSpec := range topology.Nodes { + logPath := filepath.Join(pm.oramaDir, "logs", fmt.Sprintf("%s.log", nodeSpec.Name)) + if err := pm.startNode(nodeSpec.Name, nodeSpec.ConfigFilename, logPath); err != nil { + return fmt.Errorf("failed to start %s: %w", nodeSpec.Name, err) + } + time.Sleep(500 * time.Millisecond) + } + return nil +} + diff --git a/pkg/environments/development/runner.go b/pkg/environments/development/runner.go index 9564ee7..fc0c1e6 100644 --- a/pkg/environments/development/runner.go +++ b/pkg/environments/development/runner.go @@ -2,21 +2,12 @@ package development import ( "context" - "encoding/json" "fmt" "io" - "net/http" - "net/url" "os" - "os/exec" "path/filepath" - "runtime" - "strconv" - "strings" "sync" "time" - - "github.com/DeBrosOfficial/network/pkg/tlsutil" ) // ProcessManager manages all dev environment processes @@ -69,13 +60,11 @@ func (pm *ProcessManager) StartAll(ctx context.Context) error { {"Olric", pm.startOlric}, {"Anon", pm.startAnon}, {"Nodes (Network)", pm.startNodes}, - // Gateway is now per-node (embedded in each node) - no separate main gateway needed } for _, svc := range services { if err := svc.fn(ctx); err != nil { fmt.Fprintf(pm.logWriter, "āš ļø Failed to start %s: %v\n", svc.name, err) - // Continue starting others, don't fail } } @@ -99,35 +88,6 @@ func (pm *ProcessManager) StartAll(ctx context.Context) error { return nil } -// printStartupSummary prints the final startup summary with key endpoints -func (pm *ProcessManager) printStartupSummary(topology *Topology) { - fmt.Fprintf(pm.logWriter, "\nāœ… Development environment ready!\n") - fmt.Fprintf(pm.logWriter, "═══════════════════════════════════════\n\n") - - fmt.Fprintf(pm.logWriter, "šŸ“” Access your nodes via unified gateway ports:\n\n") - for _, node := range topology.Nodes { - fmt.Fprintf(pm.logWriter, " %s:\n", node.Name) - fmt.Fprintf(pm.logWriter, " curl http://localhost:%d/health\n", node.UnifiedGatewayPort) - fmt.Fprintf(pm.logWriter, " curl http://localhost:%d/rqlite/http/db/execute\n", node.UnifiedGatewayPort) - fmt.Fprintf(pm.logWriter, " curl http://localhost:%d/cluster/health\n\n", node.UnifiedGatewayPort) - } - - fmt.Fprintf(pm.logWriter, "🌐 Main Gateway:\n") - fmt.Fprintf(pm.logWriter, " curl http://localhost:%d/v1/status\n\n", topology.GatewayPort) - - fmt.Fprintf(pm.logWriter, "šŸ“Š Other Services:\n") - fmt.Fprintf(pm.logWriter, " Olric: http://localhost:%d\n", topology.OlricHTTPPort) - fmt.Fprintf(pm.logWriter, " Anon SOCKS: 127.0.0.1:%d\n\n", topology.AnonSOCKSPort) - - fmt.Fprintf(pm.logWriter, "šŸ“ Useful Commands:\n") - fmt.Fprintf(pm.logWriter, " ./bin/orama dev status - Check service status\n") - fmt.Fprintf(pm.logWriter, " ./bin/orama dev logs node-1 - View logs\n") - fmt.Fprintf(pm.logWriter, " ./bin/orama dev down - Stop all services\n\n") - - fmt.Fprintf(pm.logWriter, "šŸ“‚ Logs: %s/logs\n", pm.oramaDir) - fmt.Fprintf(pm.logWriter, 
"āš™ļø Config: %s\n\n", pm.oramaDir) -} - // StopAll stops all running processes func (pm *ProcessManager) StopAll(ctx context.Context) error { fmt.Fprintf(pm.logWriter, "\nšŸ›‘ Stopping development environment...\n\n") @@ -153,7 +113,6 @@ func (pm *ProcessManager) StopAll(ctx context.Context) error { fmt.Fprintf(pm.logWriter, "Stopping %d services...\n\n", len(services)) - // Stop all processes sequentially (in dependency order) and wait for each stoppedCount := 0 for _, svc := range services { if err := pm.stopProcess(svc); err != nil { @@ -161,8 +120,6 @@ func (pm *ProcessManager) StopAll(ctx context.Context) error { } else { stoppedCount++ } - - // Show progress fmt.Fprintf(pm.logWriter, " [%d/%d] stopped\n", stoppedCount, len(services)) } @@ -224,7 +181,8 @@ func (pm *ProcessManager) Status(ctx context.Context) { pidPath := filepath.Join(pm.pidsDir, fmt.Sprintf("%s.pid", svc.name)) running := false if pidBytes, err := os.ReadFile(pidPath); err == nil { - pid, _ := strconv.Atoi(string(pidBytes)) + var pid int + fmt.Sscanf(string(pidBytes), "%d", &pid) if checkProcessRunning(pid) { running = true } @@ -252,888 +210,3 @@ func (pm *ProcessManager) Status(ctx context.Context) { fmt.Fprintf(pm.logWriter, "\nLogs directory: %s/logs\n\n", pm.oramaDir) } - -// Helper functions for starting individual services - -// buildIPFSNodes constructs ipfsNodeInfo from topology -func (pm *ProcessManager) buildIPFSNodes(topology *Topology) []ipfsNodeInfo { - var nodes []ipfsNodeInfo - for _, nodeSpec := range topology.Nodes { - nodes = append(nodes, ipfsNodeInfo{ - name: nodeSpec.Name, - ipfsPath: filepath.Join(pm.oramaDir, nodeSpec.DataDir, "ipfs/repo"), - apiPort: nodeSpec.IPFSAPIPort, - swarmPort: nodeSpec.IPFSSwarmPort, - gatewayPort: nodeSpec.IPFSGatewayPort, - peerID: "", - }) - } - return nodes -} - -// startNodes starts all network nodes -func (pm *ProcessManager) startNodes(ctx context.Context) error { - topology := DefaultTopology() - for _, nodeSpec := range topology.Nodes { - logPath := filepath.Join(pm.oramaDir, "logs", fmt.Sprintf("%s.log", nodeSpec.Name)) - if err := pm.startNode(nodeSpec.Name, nodeSpec.ConfigFilename, logPath); err != nil { - return fmt.Errorf("failed to start %s: %w", nodeSpec.Name, err) - } - time.Sleep(500 * time.Millisecond) - } - return nil -} - -// ipfsNodeInfo holds information about an IPFS node for peer discovery -type ipfsNodeInfo struct { - name string - ipfsPath string - apiPort int - swarmPort int - gatewayPort int - peerID string -} - -// readIPFSConfigValue reads a single config value from IPFS repo without daemon running -func readIPFSConfigValue(ctx context.Context, repoPath string, key string) (string, error) { - configPath := filepath.Join(repoPath, "config") - data, err := os.ReadFile(configPath) - if err != nil { - return "", fmt.Errorf("failed to read IPFS config: %w", err) - } - - // Simple JSON parse to extract the value - only works for string values - lines := strings.Split(string(data), "\n") - for _, line := range lines { - line = strings.TrimSpace(line) - if strings.Contains(line, key) { - // Extract the value after the colon - parts := strings.SplitN(line, ":", 2) - if len(parts) == 2 { - value := strings.TrimSpace(parts[1]) - value = strings.Trim(value, `",`) - if value != "" { - return value, nil - } - } - } - } - - return "", fmt.Errorf("key %s not found in IPFS config", key) -} - -// configureIPFSRepo directly modifies IPFS config JSON to set addresses, bootstrap, and CORS headers -// This avoids shell commands which fail on some systems 
and instead manipulates the config directly -// Returns the peer ID from the config -func configureIPFSRepo(repoPath string, apiPort, gatewayPort, swarmPort int) (string, error) { - configPath := filepath.Join(repoPath, "config") - - // Read existing config - data, err := os.ReadFile(configPath) - if err != nil { - return "", fmt.Errorf("failed to read IPFS config: %w", err) - } - - var config map[string]interface{} - if err := json.Unmarshal(data, &config); err != nil { - return "", fmt.Errorf("failed to parse IPFS config: %w", err) - } - - // Set Addresses - config["Addresses"] = map[string]interface{}{ - "API": []string{fmt.Sprintf("/ip4/127.0.0.1/tcp/%d", apiPort)}, - "Gateway": []string{fmt.Sprintf("/ip4/127.0.0.1/tcp/%d", gatewayPort)}, - "Swarm": []string{ - fmt.Sprintf("/ip4/0.0.0.0/tcp/%d", swarmPort), - fmt.Sprintf("/ip6/::/tcp/%d", swarmPort), - }, - } - - // Disable AutoConf for private swarm - config["AutoConf"] = map[string]interface{}{ - "Enabled": false, - } - - // Clear Bootstrap (will be set via HTTP API after startup) - config["Bootstrap"] = []string{} - - // Clear DNS Resolvers - if dns, ok := config["DNS"].(map[string]interface{}); ok { - dns["Resolvers"] = map[string]interface{}{} - } else { - config["DNS"] = map[string]interface{}{ - "Resolvers": map[string]interface{}{}, - } - } - - // Clear Routing DelegatedRouters - if routing, ok := config["Routing"].(map[string]interface{}); ok { - routing["DelegatedRouters"] = []string{} - } else { - config["Routing"] = map[string]interface{}{ - "DelegatedRouters": []string{}, - } - } - - // Clear IPNS DelegatedPublishers - if ipns, ok := config["Ipns"].(map[string]interface{}); ok { - ipns["DelegatedPublishers"] = []string{} - } else { - config["Ipns"] = map[string]interface{}{ - "DelegatedPublishers": []string{}, - } - } - - // Set API HTTPHeaders with CORS (must be map[string][]string) - if api, ok := config["API"].(map[string]interface{}); ok { - api["HTTPHeaders"] = map[string][]string{ - "Access-Control-Allow-Origin": {"*"}, - "Access-Control-Allow-Methods": {"GET", "PUT", "POST", "DELETE", "OPTIONS"}, - "Access-Control-Allow-Headers": {"Content-Type", "X-Requested-With"}, - "Access-Control-Expose-Headers": {"Content-Length", "Content-Range"}, - } - } else { - config["API"] = map[string]interface{}{ - "HTTPHeaders": map[string][]string{ - "Access-Control-Allow-Origin": {"*"}, - "Access-Control-Allow-Methods": {"GET", "PUT", "POST", "DELETE", "OPTIONS"}, - "Access-Control-Allow-Headers": {"Content-Type", "X-Requested-With"}, - "Access-Control-Expose-Headers": {"Content-Length", "Content-Range"}, - }, - } - } - - // Write config back - updatedData, err := json.MarshalIndent(config, "", " ") - if err != nil { - return "", fmt.Errorf("failed to marshal IPFS config: %w", err) - } - - if err := os.WriteFile(configPath, updatedData, 0644); err != nil { - return "", fmt.Errorf("failed to write IPFS config: %w", err) - } - - // Extract and return peer ID - if id, ok := config["Identity"].(map[string]interface{}); ok { - if peerID, ok := id["PeerID"].(string); ok { - return peerID, nil - } - } - - return "", fmt.Errorf("could not extract peer ID from config") -} - -// seedIPFSPeersWithHTTP configures each IPFS node to bootstrap with its local peers using HTTP API -func (pm *ProcessManager) seedIPFSPeersWithHTTP(ctx context.Context, nodes []ipfsNodeInfo) error { - fmt.Fprintf(pm.logWriter, " Seeding IPFS local bootstrap peers via HTTP API...\n") - - // Wait for all IPFS daemons to be ready before trying to configure them - for _, 
node := range nodes { - if err := pm.waitIPFSReady(ctx, node); err != nil { - fmt.Fprintf(pm.logWriter, " Warning: failed to wait for IPFS readiness for %s: %v\n", node.name, err) - } - } - - // For each node, clear default bootstrap and add local peers via HTTP - for i, node := range nodes { - // Clear bootstrap peers - httpURL := fmt.Sprintf("http://127.0.0.1:%d/api/v0/bootstrap/rm?all=true", node.apiPort) - if err := pm.ipfsHTTPCall(ctx, httpURL, "POST"); err != nil { - fmt.Fprintf(pm.logWriter, " Warning: failed to clear bootstrap for %s: %v\n", node.name, err) - } - - // Add other nodes as bootstrap peers - for j, otherNode := range nodes { - if i == j { - continue // Skip self - } - - multiaddr := fmt.Sprintf("/ip4/127.0.0.1/tcp/%d/p2p/%s", otherNode.swarmPort, otherNode.peerID) - httpURL := fmt.Sprintf("http://127.0.0.1:%d/api/v0/bootstrap/add?arg=%s", node.apiPort, url.QueryEscape(multiaddr)) - if err := pm.ipfsHTTPCall(ctx, httpURL, "POST"); err != nil { - fmt.Fprintf(pm.logWriter, " Warning: failed to add bootstrap peer for %s: %v\n", node.name, err) - } - } - } - - return nil -} - -// waitIPFSReady polls the IPFS daemon's HTTP API until it's ready -func (pm *ProcessManager) waitIPFSReady(ctx context.Context, node ipfsNodeInfo) error { - maxRetries := 30 - retryInterval := 500 * time.Millisecond - - for attempt := 0; attempt < maxRetries; attempt++ { - httpURL := fmt.Sprintf("http://127.0.0.1:%d/api/v0/version", node.apiPort) - if err := pm.ipfsHTTPCall(ctx, httpURL, "POST"); err == nil { - return nil // IPFS is ready - } - - select { - case <-time.After(retryInterval): - continue - case <-ctx.Done(): - return ctx.Err() - } - } - - return fmt.Errorf("IPFS daemon %s did not become ready after %d seconds", node.name, (maxRetries * int(retryInterval.Seconds()))) -} - -// ipfsHTTPCall makes an HTTP call to IPFS API -func (pm *ProcessManager) ipfsHTTPCall(ctx context.Context, urlStr string, method string) error { - client := tlsutil.NewHTTPClient(5 * time.Second) - req, err := http.NewRequestWithContext(ctx, method, urlStr, nil) - if err != nil { - return fmt.Errorf("failed to create request: %w", err) - } - - resp, err := client.Do(req) - if err != nil { - return fmt.Errorf("HTTP call failed: %w", err) - } - defer resp.Body.Close() - - if resp.StatusCode >= 400 { - body, _ := io.ReadAll(resp.Body) - return fmt.Errorf("HTTP %d: %s", resp.StatusCode, string(body)) - } - - return nil -} - -func (pm *ProcessManager) startIPFS(ctx context.Context) error { - topology := DefaultTopology() - nodes := pm.buildIPFSNodes(topology) - - // Phase 1: Initialize repos and configure addresses - for i := range nodes { - os.MkdirAll(nodes[i].ipfsPath, 0755) - - // Initialize IPFS if needed - if _, err := os.Stat(filepath.Join(nodes[i].ipfsPath, "config")); os.IsNotExist(err) { - fmt.Fprintf(pm.logWriter, " Initializing IPFS (%s)...\n", nodes[i].name) - cmd := exec.CommandContext(ctx, "ipfs", "init", "--profile=server", "--repo-dir="+nodes[i].ipfsPath) - if _, err := cmd.CombinedOutput(); err != nil { - fmt.Fprintf(pm.logWriter, " Warning: ipfs init failed: %v\n", err) - } - - // Copy swarm key - swarmKeyPath := filepath.Join(pm.oramaDir, "swarm.key") - if data, err := os.ReadFile(swarmKeyPath); err == nil { - os.WriteFile(filepath.Join(nodes[i].ipfsPath, "swarm.key"), data, 0600) - } - } - - // Configure the IPFS config directly (addresses, bootstrap, DNS, routing, CORS headers) - // This replaces shell commands which can fail on some systems - peerID, err := configureIPFSRepo(nodes[i].ipfsPath, 
nodes[i].apiPort, nodes[i].gatewayPort, nodes[i].swarmPort) - if err != nil { - fmt.Fprintf(pm.logWriter, " Warning: failed to configure IPFS repo for %s: %v\n", nodes[i].name, err) - } else { - nodes[i].peerID = peerID - fmt.Fprintf(pm.logWriter, " Peer ID for %s: %s\n", nodes[i].name, peerID) - } - } - - // Phase 2: Start all IPFS daemons - for i := range nodes { - pidPath := filepath.Join(pm.pidsDir, fmt.Sprintf("ipfs-%s.pid", nodes[i].name)) - logPath := filepath.Join(pm.oramaDir, "logs", fmt.Sprintf("ipfs-%s.log", nodes[i].name)) - - cmd := exec.CommandContext(ctx, "ipfs", "daemon", "--enable-pubsub-experiment", "--repo-dir="+nodes[i].ipfsPath) - logFile, _ := os.Create(logPath) - cmd.Stdout = logFile - cmd.Stderr = logFile - - if err := cmd.Start(); err != nil { - return fmt.Errorf("failed to start ipfs-%s: %w", nodes[i].name, err) - } - - os.WriteFile(pidPath, []byte(fmt.Sprintf("%d", cmd.Process.Pid)), 0644) - pm.processes[fmt.Sprintf("ipfs-%s", nodes[i].name)] = &ManagedProcess{ - Name: fmt.Sprintf("ipfs-%s", nodes[i].name), - PID: cmd.Process.Pid, - StartTime: time.Now(), - LogPath: logPath, - } - - fmt.Fprintf(pm.logWriter, "āœ“ IPFS (%s) started (PID: %d, API: %d, Swarm: %d)\n", nodes[i].name, cmd.Process.Pid, nodes[i].apiPort, nodes[i].swarmPort) - } - - time.Sleep(2 * time.Second) - - // Phase 3: Seed IPFS peers via HTTP API after all daemons are running - if err := pm.seedIPFSPeersWithHTTP(ctx, nodes); err != nil { - fmt.Fprintf(pm.logWriter, "āš ļø Failed to seed IPFS peers: %v\n", err) - } - - return nil -} - -func (pm *ProcessManager) startIPFSCluster(ctx context.Context) error { - topology := DefaultTopology() - var nodes []struct { - name string - clusterPath string - restAPIPort int - clusterPort int - ipfsPort int - } - - for _, nodeSpec := range topology.Nodes { - nodes = append(nodes, struct { - name string - clusterPath string - restAPIPort int - clusterPort int - ipfsPort int - }{ - nodeSpec.Name, - filepath.Join(pm.oramaDir, nodeSpec.DataDir, "ipfs-cluster"), - nodeSpec.ClusterAPIPort, - nodeSpec.ClusterPort, - nodeSpec.IPFSAPIPort, - }) - } - - // Wait for all IPFS daemons to be ready before starting cluster services - fmt.Fprintf(pm.logWriter, " Waiting for IPFS daemons to be ready...\n") - ipfsNodes := pm.buildIPFSNodes(topology) - for _, ipfsNode := range ipfsNodes { - if err := pm.waitIPFSReady(ctx, ipfsNode); err != nil { - fmt.Fprintf(pm.logWriter, " Warning: IPFS %s did not become ready: %v\n", ipfsNode.name, err) - } - } - - // Read cluster secret to ensure all nodes use the same PSK - secretPath := filepath.Join(pm.oramaDir, "cluster-secret") - clusterSecret, err := os.ReadFile(secretPath) - if err != nil { - return fmt.Errorf("failed to read cluster secret: %w", err) - } - clusterSecretHex := strings.TrimSpace(string(clusterSecret)) - - // Phase 1: Initialize and start bootstrap IPFS Cluster, then read its identity - bootstrapMultiaddr := "" - { - node := nodes[0] // bootstrap - - // Always clean stale cluster state to ensure fresh initialization with correct secret - if err := pm.cleanClusterState(node.clusterPath); err != nil { - fmt.Fprintf(pm.logWriter, " Warning: failed to clean cluster state for %s: %v\n", node.name, err) - } - - os.MkdirAll(node.clusterPath, 0755) - fmt.Fprintf(pm.logWriter, " Initializing IPFS Cluster (%s)...\n", node.name) - cmd := exec.CommandContext(ctx, "ipfs-cluster-service", "init", "--force") - cmd.Env = append(os.Environ(), - fmt.Sprintf("IPFS_CLUSTER_PATH=%s", node.clusterPath), - fmt.Sprintf("CLUSTER_SECRET=%s", 
clusterSecretHex), - ) - if output, err := cmd.CombinedOutput(); err != nil { - fmt.Fprintf(pm.logWriter, " Warning: ipfs-cluster-service init failed: %v (output: %s)\n", err, string(output)) - } - - // Ensure correct ports in service.json BEFORE starting daemon - // This is critical: it sets the cluster listen port to clusterPort, not the default - if err := pm.ensureIPFSClusterPorts(node.clusterPath, node.restAPIPort, node.clusterPort); err != nil { - fmt.Fprintf(pm.logWriter, " Warning: failed to update IPFS Cluster config for %s: %v\n", node.name, err) - } - - // Verify the config was written correctly (debug: read it back) - serviceJSONPath := filepath.Join(node.clusterPath, "service.json") - if data, err := os.ReadFile(serviceJSONPath); err == nil { - var verifyConfig map[string]interface{} - if err := json.Unmarshal(data, &verifyConfig); err == nil { - if cluster, ok := verifyConfig["cluster"].(map[string]interface{}); ok { - if listenAddrs, ok := cluster["listen_multiaddress"].([]interface{}); ok { - fmt.Fprintf(pm.logWriter, " Config verified: %s cluster listening on %v\n", node.name, listenAddrs) - } - } - } - } - - // Start bootstrap cluster service - pidPath := filepath.Join(pm.pidsDir, fmt.Sprintf("ipfs-cluster-%s.pid", node.name)) - logPath := filepath.Join(pm.oramaDir, "logs", fmt.Sprintf("ipfs-cluster-%s.log", node.name)) - - cmd = exec.CommandContext(ctx, "ipfs-cluster-service", "daemon") - cmd.Env = append(os.Environ(), fmt.Sprintf("IPFS_CLUSTER_PATH=%s", node.clusterPath)) - logFile, _ := os.Create(logPath) - cmd.Stdout = logFile - cmd.Stderr = logFile - - if err := cmd.Start(); err != nil { - fmt.Fprintf(pm.logWriter, " āš ļø Failed to start ipfs-cluster-%s: %v\n", node.name, err) - return err - } - - os.WriteFile(pidPath, []byte(fmt.Sprintf("%d", cmd.Process.Pid)), 0644) - fmt.Fprintf(pm.logWriter, "āœ“ IPFS Cluster (%s) started (PID: %d, API: %d)\n", node.name, cmd.Process.Pid, node.restAPIPort) - - // Wait for bootstrap to be ready and read its identity - if err := pm.waitClusterReady(ctx, node.name, node.restAPIPort); err != nil { - fmt.Fprintf(pm.logWriter, " Warning: IPFS Cluster %s did not become ready: %v\n", node.name, err) - } - - // Add a brief delay to allow identity.json to be written - time.Sleep(500 * time.Millisecond) - - // Read bootstrap peer ID for follower nodes to join - peerID, err := pm.waitForClusterPeerID(ctx, filepath.Join(node.clusterPath, "identity.json")) - if err != nil { - fmt.Fprintf(pm.logWriter, " Warning: failed to read bootstrap peer ID: %v\n", err) - } else { - bootstrapMultiaddr = fmt.Sprintf("/ip4/127.0.0.1/tcp/%d/p2p/%s", node.clusterPort, peerID) - fmt.Fprintf(pm.logWriter, " Bootstrap multiaddress: %s\n", bootstrapMultiaddr) - } - } - - // Phase 2: Initialize and start follower IPFS Cluster nodes with bootstrap flag - for i := 1; i < len(nodes); i++ { - node := nodes[i] - - // Always clean stale cluster state to ensure fresh initialization with correct secret - if err := pm.cleanClusterState(node.clusterPath); err != nil { - fmt.Fprintf(pm.logWriter, " Warning: failed to clean cluster state for %s: %v\n", node.name, err) - } - - os.MkdirAll(node.clusterPath, 0755) - fmt.Fprintf(pm.logWriter, " Initializing IPFS Cluster (%s)...\n", node.name) - cmd := exec.CommandContext(ctx, "ipfs-cluster-service", "init", "--force") - cmd.Env = append(os.Environ(), - fmt.Sprintf("IPFS_CLUSTER_PATH=%s", node.clusterPath), - fmt.Sprintf("CLUSTER_SECRET=%s", clusterSecretHex), - ) - if output, err := cmd.CombinedOutput(); err != nil { - 
fmt.Fprintf(pm.logWriter, " Warning: ipfs-cluster-service init failed for %s: %v (output: %s)\n", node.name, err, string(output)) - } - - // Ensure correct ports in service.json BEFORE starting daemon - if err := pm.ensureIPFSClusterPorts(node.clusterPath, node.restAPIPort, node.clusterPort); err != nil { - fmt.Fprintf(pm.logWriter, " Warning: failed to update IPFS Cluster config for %s: %v\n", node.name, err) - } - - // Verify the config was written correctly (debug: read it back) - serviceJSONPath := filepath.Join(node.clusterPath, "service.json") - if data, err := os.ReadFile(serviceJSONPath); err == nil { - var verifyConfig map[string]interface{} - if err := json.Unmarshal(data, &verifyConfig); err == nil { - if cluster, ok := verifyConfig["cluster"].(map[string]interface{}); ok { - if listenAddrs, ok := cluster["listen_multiaddress"].([]interface{}); ok { - fmt.Fprintf(pm.logWriter, " Config verified: %s cluster listening on %v\n", node.name, listenAddrs) - } - } - } - } - - // Start follower cluster service with bootstrap flag - pidPath := filepath.Join(pm.pidsDir, fmt.Sprintf("ipfs-cluster-%s.pid", node.name)) - logPath := filepath.Join(pm.oramaDir, "logs", fmt.Sprintf("ipfs-cluster-%s.log", node.name)) - - args := []string{"daemon"} - if bootstrapMultiaddr != "" { - args = append(args, "--bootstrap", bootstrapMultiaddr) - } - - cmd = exec.CommandContext(ctx, "ipfs-cluster-service", args...) - cmd.Env = append(os.Environ(), fmt.Sprintf("IPFS_CLUSTER_PATH=%s", node.clusterPath)) - logFile, _ := os.Create(logPath) - cmd.Stdout = logFile - cmd.Stderr = logFile - - if err := cmd.Start(); err != nil { - fmt.Fprintf(pm.logWriter, " āš ļø Failed to start ipfs-cluster-%s: %v\n", node.name, err) - continue - } - - os.WriteFile(pidPath, []byte(fmt.Sprintf("%d", cmd.Process.Pid)), 0644) - fmt.Fprintf(pm.logWriter, "āœ“ IPFS Cluster (%s) started (PID: %d, API: %d)\n", node.name, cmd.Process.Pid, node.restAPIPort) - - // Wait for follower node to connect to the bootstrap peer - if err := pm.waitClusterReady(ctx, node.name, node.restAPIPort); err != nil { - fmt.Fprintf(pm.logWriter, " Warning: IPFS Cluster %s did not become ready: %v\n", node.name, err) - } - } - - // Phase 3: Wait for all cluster peers to discover each other - fmt.Fprintf(pm.logWriter, " Waiting for IPFS Cluster peers to form...\n") - if err := pm.waitClusterFormed(ctx, nodes[0].restAPIPort); err != nil { - fmt.Fprintf(pm.logWriter, " Warning: IPFS Cluster did not form fully: %v\n", err) - } - - time.Sleep(1 * time.Second) - return nil -} - -// waitForClusterPeerID polls the identity.json file until it appears and extracts the peer ID -func (pm *ProcessManager) waitForClusterPeerID(ctx context.Context, identityPath string) (string, error) { - maxRetries := 30 - retryInterval := 500 * time.Millisecond - - for attempt := 0; attempt < maxRetries; attempt++ { - data, err := os.ReadFile(identityPath) - if err == nil { - var identity map[string]interface{} - if err := json.Unmarshal(data, &identity); err == nil { - if id, ok := identity["id"].(string); ok { - return id, nil - } - } - } - - select { - case <-time.After(retryInterval): - continue - case <-ctx.Done(): - return "", ctx.Err() - } - } - - return "", fmt.Errorf("could not read cluster peer ID after %d seconds", (maxRetries * int(retryInterval.Milliseconds()) / 1000)) -} - -// waitClusterReady polls the cluster REST API until it's ready -func (pm *ProcessManager) waitClusterReady(ctx context.Context, name string, restAPIPort int) error { - maxRetries := 30 - retryInterval := 
500 * time.Millisecond - - for attempt := 0; attempt < maxRetries; attempt++ { - httpURL := fmt.Sprintf("http://127.0.0.1:%d/peers", restAPIPort) - resp, err := http.Get(httpURL) - if err == nil && resp.StatusCode == 200 { - resp.Body.Close() - return nil - } - if resp != nil { - resp.Body.Close() - } - - select { - case <-time.After(retryInterval): - continue - case <-ctx.Done(): - return ctx.Err() - } - } - - return fmt.Errorf("IPFS Cluster %s did not become ready after %d seconds", name, (maxRetries * int(retryInterval.Seconds()))) -} - -// waitClusterFormed waits for all cluster peers to be visible from the bootstrap node -func (pm *ProcessManager) waitClusterFormed(ctx context.Context, bootstrapRestAPIPort int) error { - maxRetries := 30 - retryInterval := 1 * time.Second - requiredPeers := 3 // bootstrap, node2, node3 - - for attempt := 0; attempt < maxRetries; attempt++ { - httpURL := fmt.Sprintf("http://127.0.0.1:%d/peers", bootstrapRestAPIPort) - resp, err := http.Get(httpURL) - if err == nil && resp.StatusCode == 200 { - // The /peers endpoint returns NDJSON (newline-delimited JSON), not a JSON array - // We need to stream-read each peer object - dec := json.NewDecoder(resp.Body) - peerCount := 0 - for { - var peer interface{} - err := dec.Decode(&peer) - if err != nil { - if err == io.EOF { - break - } - break // Stop on parse error - } - peerCount++ - } - resp.Body.Close() - if peerCount >= requiredPeers { - return nil // All peers have formed - } - } - if resp != nil { - resp.Body.Close() - } - - select { - case <-time.After(retryInterval): - continue - case <-ctx.Done(): - return ctx.Err() - } - } - - return fmt.Errorf("IPFS Cluster did not form fully after %d seconds", (maxRetries * int(retryInterval.Seconds()))) -} - -// cleanClusterState removes stale cluster state files to ensure fresh initialization -// This prevents PSK (private network key) mismatches when cluster secret changes -func (pm *ProcessManager) cleanClusterState(clusterPath string) error { - // Remove pebble datastore (contains persisted PSK state) - pebblePath := filepath.Join(clusterPath, "pebble") - if err := os.RemoveAll(pebblePath); err != nil && !os.IsNotExist(err) { - return fmt.Errorf("failed to remove pebble directory: %w", err) - } - - // Remove peerstore (contains peer addresses and metadata) - peerstorePath := filepath.Join(clusterPath, "peerstore") - if err := os.Remove(peerstorePath); err != nil && !os.IsNotExist(err) { - return fmt.Errorf("failed to remove peerstore: %w", err) - } - - // Remove service.json (will be regenerated with correct ports and secret) - serviceJSONPath := filepath.Join(clusterPath, "service.json") - if err := os.Remove(serviceJSONPath); err != nil && !os.IsNotExist(err) { - return fmt.Errorf("failed to remove service.json: %w", err) - } - - // Remove cluster.lock if it exists (from previous run) - lockPath := filepath.Join(clusterPath, "cluster.lock") - if err := os.Remove(lockPath); err != nil && !os.IsNotExist(err) { - return fmt.Errorf("failed to remove cluster.lock: %w", err) - } - - // Note: We keep identity.json as it's tied to the node's peer ID - // The secret will be updated via CLUSTER_SECRET env var during init - - return nil -} - -// ensureIPFSClusterPorts updates service.json with correct per-node ports and IPFS connector settings -func (pm *ProcessManager) ensureIPFSClusterPorts(clusterPath string, restAPIPort int, clusterPort int) error { - serviceJSONPath := filepath.Join(clusterPath, "service.json") - - // Read existing config - data, err := 
os.ReadFile(serviceJSONPath) - if err != nil { - return fmt.Errorf("failed to read service.json: %w", err) - } - - var config map[string]interface{} - if err := json.Unmarshal(data, &config); err != nil { - return fmt.Errorf("failed to unmarshal service.json: %w", err) - } - - // Calculate unique ports for this node based on restAPIPort offset - // bootstrap=9094 -> proxy=9095, pinsvc=9097, cluster=9096 - // node2=9104 -> proxy=9105, pinsvc=9107, cluster=9106 - // node3=9114 -> proxy=9115, pinsvc=9117, cluster=9116 - portOffset := restAPIPort - 9094 - proxyPort := 9095 + portOffset - pinsvcPort := 9097 + portOffset - - // Infer IPFS port from REST API port - // 9094 -> 4501 (bootstrap), 9104 -> 4502 (node2), 9114 -> 4503 (node3) - ipfsPort := 4501 + (portOffset / 10) - - // Update API settings - if api, ok := config["api"].(map[string]interface{}); ok { - // Update REST API listen address - if restapi, ok := api["restapi"].(map[string]interface{}); ok { - restapi["http_listen_multiaddress"] = fmt.Sprintf("/ip4/0.0.0.0/tcp/%d", restAPIPort) - } - - // Update IPFS Proxy settings - if proxy, ok := api["ipfsproxy"].(map[string]interface{}); ok { - proxy["listen_multiaddress"] = fmt.Sprintf("/ip4/127.0.0.1/tcp/%d", proxyPort) - proxy["node_multiaddress"] = fmt.Sprintf("/ip4/127.0.0.1/tcp/%d", ipfsPort) - } - - // Update Pinning Service API port - if pinsvc, ok := api["pinsvcapi"].(map[string]interface{}); ok { - pinsvc["http_listen_multiaddress"] = fmt.Sprintf("/ip4/127.0.0.1/tcp/%d", pinsvcPort) - } - } - - // Update cluster listen multiaddress to match the correct port - // Replace all old listen addresses with new ones for the correct port - if cluster, ok := config["cluster"].(map[string]interface{}); ok { - listenAddrs := []string{ - fmt.Sprintf("/ip4/0.0.0.0/tcp/%d", clusterPort), - fmt.Sprintf("/ip4/127.0.0.1/tcp/%d", clusterPort), - } - cluster["listen_multiaddress"] = listenAddrs - } - - // Update IPFS connector settings to point to correct IPFS API port - if connector, ok := config["ipfs_connector"].(map[string]interface{}); ok { - if ipfshttp, ok := connector["ipfshttp"].(map[string]interface{}); ok { - ipfshttp["node_multiaddress"] = fmt.Sprintf("/ip4/127.0.0.1/tcp/%d", ipfsPort) - } - } - - // Write updated config - updatedData, err := json.MarshalIndent(config, "", " ") - if err != nil { - return fmt.Errorf("failed to marshal updated config: %w", err) - } - - if err := os.WriteFile(serviceJSONPath, updatedData, 0644); err != nil { - return fmt.Errorf("failed to write service.json: %w", err) - } - - return nil -} - -func (pm *ProcessManager) startOlric(ctx context.Context) error { - pidPath := filepath.Join(pm.pidsDir, "olric.pid") - logPath := filepath.Join(pm.oramaDir, "logs", "olric.log") - configPath := filepath.Join(pm.oramaDir, "olric-config.yaml") - - cmd := exec.CommandContext(ctx, "olric-server") - cmd.Env = append(os.Environ(), fmt.Sprintf("OLRIC_SERVER_CONFIG=%s", configPath)) - logFile, _ := os.Create(logPath) - cmd.Stdout = logFile - cmd.Stderr = logFile - - if err := cmd.Start(); err != nil { - return fmt.Errorf("failed to start olric: %w", err) - } - - os.WriteFile(pidPath, []byte(fmt.Sprintf("%d", cmd.Process.Pid)), 0644) - fmt.Fprintf(pm.logWriter, "✓ Olric started (PID: %d)\n", cmd.Process.Pid) - - time.Sleep(1 * time.Second) - return nil -} - -func (pm *ProcessManager) startAnon(ctx context.Context) error { - if runtime.GOOS != "darwin" { - return nil // Skip on non-macOS for now - } - - pidPath := filepath.Join(pm.pidsDir, "anon.pid") - logPath := 
filepath.Join(pm.oramaDir, "logs", "anon.log") - - cmd := exec.CommandContext(ctx, "npx", "anyone-client") - logFile, _ := os.Create(logPath) - cmd.Stdout = logFile - cmd.Stderr = logFile - - if err := cmd.Start(); err != nil { - fmt.Fprintf(pm.logWriter, " ⚠️ Failed to start Anon: %v\n", err) - return nil - } - - os.WriteFile(pidPath, []byte(fmt.Sprintf("%d", cmd.Process.Pid)), 0644) - fmt.Fprintf(pm.logWriter, "✓ Anon proxy started (PID: %d, SOCKS: 9050)\n", cmd.Process.Pid) - - return nil -} - -func (pm *ProcessManager) startNode(name, configFile, logPath string) error { - pidPath := filepath.Join(pm.pidsDir, fmt.Sprintf("%s.pid", name)) - cmd := exec.Command("./bin/orama-node", "--config", configFile) - logFile, _ := os.Create(logPath) - cmd.Stdout = logFile - cmd.Stderr = logFile - - if err := cmd.Start(); err != nil { - return fmt.Errorf("failed to start %s: %w", name, err) - } - - os.WriteFile(pidPath, []byte(fmt.Sprintf("%d", cmd.Process.Pid)), 0644) - fmt.Fprintf(pm.logWriter, "✓ %s started (PID: %d)\n", strings.Title(name), cmd.Process.Pid) - - time.Sleep(1 * time.Second) - return nil -} - -func (pm *ProcessManager) startGateway(ctx context.Context) error { - pidPath := filepath.Join(pm.pidsDir, "gateway.pid") - logPath := filepath.Join(pm.oramaDir, "logs", "gateway.log") - - cmd := exec.Command("./bin/gateway", "--config", "gateway.yaml") - logFile, _ := os.Create(logPath) - cmd.Stdout = logFile - cmd.Stderr = logFile - - if err := cmd.Start(); err != nil { - return fmt.Errorf("failed to start gateway: %w", err) - } - - os.WriteFile(pidPath, []byte(fmt.Sprintf("%d", cmd.Process.Pid)), 0644) - fmt.Fprintf(pm.logWriter, "✓ Gateway started (PID: %d, listen: 6001)\n", cmd.Process.Pid) - - return nil -} - -// stopProcess terminates a managed process and its children -func (pm *ProcessManager) stopProcess(name string) error { - pidPath := filepath.Join(pm.pidsDir, fmt.Sprintf("%s.pid", name)) - pidBytes, err := os.ReadFile(pidPath) - if err != nil { - return nil // Process not running or PID not found - } - - pid, err := strconv.Atoi(strings.TrimSpace(string(pidBytes))) - if err != nil { - os.Remove(pidPath) - return nil - } - - // Check if process exists before trying to kill - if !checkProcessRunning(pid) { - os.Remove(pidPath) - fmt.Fprintf(pm.logWriter, "✓ %s (not running)\n", name) - return nil - } - - proc, err := os.FindProcess(pid) - if err != nil { - os.Remove(pidPath) - return nil - } - - // Try graceful shutdown first (SIGTERM) - proc.Signal(os.Interrupt) - - // Wait up to 2 seconds for graceful shutdown - gracefulShutdown := false - for i := 0; i < 20; i++ { - time.Sleep(100 * time.Millisecond) - if !checkProcessRunning(pid) { - gracefulShutdown = true - break - } - } - - // Force kill if still running after graceful attempt - if !gracefulShutdown && checkProcessRunning(pid) { - proc.Signal(os.Kill) - time.Sleep(200 * time.Millisecond) - - // Kill any child processes (platform-specific) - if runtime.GOOS != "windows" { - exec.Command("pkill", "-9", "-P", fmt.Sprintf("%d", pid)).Run() - } - - // Final force kill attempt if somehow still alive - if checkProcessRunning(pid) { - exec.Command("kill", "-9", fmt.Sprintf("%d", pid)).Run() - time.Sleep(100 * time.Millisecond) - } - } - - os.Remove(pidPath) - - if gracefulShutdown { - fmt.Fprintf(pm.logWriter, "✓ %s stopped gracefully\n", name) - } else { - fmt.Fprintf(pm.logWriter, "✓ %s stopped (forced)\n", name) - } - return nil -} - -// checkProcessRunning checks if a process with given PID is running -func 
checkProcessRunning(pid int) bool { - proc, err := os.FindProcess(pid) - if err != nil { - return false - } - - // Send signal 0 to check if process exists (doesn't actually send signal) - err = proc.Signal(os.Signal(nil)) - return err == nil -} diff --git a/pkg/gateway/jwt.go b/pkg/gateway/auth/jwt.go similarity index 86% rename from pkg/gateway/jwt.go rename to pkg/gateway/auth/jwt.go index 54e143c..14a7fcd 100644 --- a/pkg/gateway/jwt.go +++ b/pkg/gateway/auth/jwt.go @@ -1,4 +1,4 @@ -package gateway +package auth import ( "crypto" @@ -13,13 +13,13 @@ import ( "time" ) -func (g *Gateway) jwksHandler(w http.ResponseWriter, r *http.Request) { +func (s *Service) JWKSHandler(w http.ResponseWriter, r *http.Request) { w.Header().Set("Content-Type", "application/json") - if g.signingKey == nil { + if s.signingKey == nil { _ = json.NewEncoder(w).Encode(map[string]any{"keys": []any{}}) return } - pub := g.signingKey.Public().(*rsa.PublicKey) + pub := s.signingKey.Public().(*rsa.PublicKey) n := pub.N.Bytes() // Encode exponent as big-endian bytes eVal := pub.E @@ -35,7 +35,7 @@ func (g *Gateway) jwksHandler(w http.ResponseWriter, r *http.Request) { "kty": "RSA", "use": "sig", "alg": "RS256", - "kid": g.keyID, + "kid": s.keyID, "n": base64.RawURLEncoding.EncodeToString(n), "e": base64.RawURLEncoding.EncodeToString(eb), } @@ -49,7 +49,7 @@ type jwtHeader struct { Kid string `json:"kid"` } -type jwtClaims struct { +type JWTClaims struct { Iss string `json:"iss"` Sub string `json:"sub"` Aud string `json:"aud"` @@ -59,9 +59,9 @@ type jwtClaims struct { Namespace string `json:"namespace"` } -// parseAndVerifyJWT verifies an RS256 JWT created by this gateway and returns claims -func (g *Gateway) parseAndVerifyJWT(token string) (*jwtClaims, error) { - if g.signingKey == nil { +// ParseAndVerifyJWT verifies an RS256 JWT created by this gateway and returns claims +func (s *Service) ParseAndVerifyJWT(token string) (*JWTClaims, error) { + if s.signingKey == nil { return nil, errors.New("signing key unavailable") } parts := strings.Split(token, ".") @@ -90,12 +90,12 @@ func (g *Gateway) parseAndVerifyJWT(token string) (*jwtClaims, error) { // Verify signature signingInput := parts[0] + "." + parts[1] sum := sha256.Sum256([]byte(signingInput)) - pub := g.signingKey.Public().(*rsa.PublicKey) + pub := s.signingKey.Public().(*rsa.PublicKey) if err := rsa.VerifyPKCS1v15(pub, crypto.SHA256, sum[:], sb); err != nil { return nil, errors.New("invalid signature") } // Parse claims - var claims jwtClaims + var claims JWTClaims if err := json.Unmarshal(pb, &claims); err != nil { return nil, errors.New("invalid claims json") } @@ -122,14 +122,14 @@ func (g *Gateway) parseAndVerifyJWT(token string) (*jwtClaims, error) { return &claims, nil } -func (g *Gateway) generateJWT(ns, subject string, ttl time.Duration) (string, int64, error) { - if g.signingKey == nil { +func (s *Service) GenerateJWT(ns, subject string, ttl time.Duration) (string, int64, error) { + if s.signingKey == nil { return "", 0, errors.New("signing key unavailable") } header := map[string]string{ "alg": "RS256", "typ": "JWT", - "kid": g.keyID, + "kid": s.keyID, } hb, _ := json.Marshal(header) now := time.Now().UTC() @@ -148,7 +148,7 @@ func (g *Gateway) generateJWT(ns, subject string, ttl time.Duration) (string, in pb64 := base64.RawURLEncoding.EncodeToString(pb) signingInput := hb64 + "." 
+ pb64 sum := sha256.Sum256([]byte(signingInput)) - sig, err := rsa.SignPKCS1v15(rand.Reader, g.signingKey, crypto.SHA256, sum[:]) + sig, err := rsa.SignPKCS1v15(rand.Reader, s.signingKey, crypto.SHA256, sum[:]) if err != nil { return "", 0, err } diff --git a/pkg/gateway/auth/service.go b/pkg/gateway/auth/service.go new file mode 100644 index 0000000..be8f40d --- /dev/null +++ b/pkg/gateway/auth/service.go @@ -0,0 +1,391 @@ +package auth + +import ( + "context" + "crypto/ed25519" + "crypto/rand" + "crypto/rsa" + "crypto/sha256" + "crypto/x509" + "encoding/base64" + "encoding/hex" + "encoding/json" + "encoding/pem" + "fmt" + "math/big" + "strconv" + "strings" + "time" + + "github.com/DeBrosOfficial/network/pkg/client" + "github.com/DeBrosOfficial/network/pkg/logging" + ethcrypto "github.com/ethereum/go-ethereum/crypto" +) + +// Service handles authentication business logic +type Service struct { + logger *logging.ColoredLogger + orm client.NetworkClient + signingKey *rsa.PrivateKey + keyID string + defaultNS string +} + +func NewService(logger *logging.ColoredLogger, orm client.NetworkClient, signingKeyPEM string, defaultNS string) (*Service, error) { + s := &Service{ + logger: logger, + orm: orm, + defaultNS: defaultNS, + } + + if signingKeyPEM != "" { + block, _ := pem.Decode([]byte(signingKeyPEM)) + if block == nil { + return nil, fmt.Errorf("failed to parse signing key PEM") + } + key, err := x509.ParsePKCS1PrivateKey(block.Bytes) + if err != nil { + return nil, fmt.Errorf("failed to parse RSA private key: %w", err) + } + s.signingKey = key + + // Generate a simple KID from the public key hash + pubBytes := x509.MarshalPKCS1PublicKey(&key.PublicKey) + sum := sha256.Sum256(pubBytes) + s.keyID = hex.EncodeToString(sum[:8]) + } + + return s, nil +} + +// CreateNonce generates a new nonce and stores it in the database +func (s *Service) CreateNonce(ctx context.Context, wallet, purpose, namespace string) (string, error) { + // Generate a URL-safe random nonce (32 bytes) + buf := make([]byte, 32) + if _, err := rand.Read(buf); err != nil { + return "", fmt.Errorf("failed to generate nonce: %w", err) + } + nonce := base64.RawURLEncoding.EncodeToString(buf) + + // Use internal context to bypass authentication for system operations + internalCtx := client.WithInternalAuth(ctx) + db := s.orm.Database() + + if namespace == "" { + namespace = s.defaultNS + if namespace == "" { + namespace = "default" + } + } + + // Ensure namespace exists + if _, err := db.Query(internalCtx, "INSERT OR IGNORE INTO namespaces(name) VALUES (?)", namespace); err != nil { + return "", fmt.Errorf("failed to ensure namespace: %w", err) + } + + nsID, err := s.ResolveNamespaceID(ctx, namespace) + if err != nil { + return "", fmt.Errorf("failed to resolve namespace ID: %w", err) + } + + // Store nonce with 5 minute expiry + walletLower := strings.ToLower(strings.TrimSpace(wallet)) + if _, err := db.Query(internalCtx, + "INSERT INTO nonces(namespace_id, wallet, nonce, purpose, expires_at) VALUES (?, ?, ?, ?, datetime('now', '+5 minutes'))", + nsID, walletLower, nonce, purpose, + ); err != nil { + return "", fmt.Errorf("failed to store nonce: %w", err) + } + + return nonce, nil +} + +// VerifySignature verifies a wallet signature for a given nonce +func (s *Service) VerifySignature(ctx context.Context, wallet, nonce, signature, chainType string) (bool, error) { + chainType = strings.ToUpper(strings.TrimSpace(chainType)) + if chainType == "" { + chainType = "ETH" + } + + switch chainType { + case "ETH": + return 
s.verifyEthSignature(wallet, nonce, signature) + case "SOL": + return s.verifySolSignature(wallet, nonce, signature) + default: + return false, fmt.Errorf("unsupported chain type: %s", chainType) + } +} + +func (s *Service) verifyEthSignature(wallet, nonce, signature string) (bool, error) { + msg := []byte(nonce) + prefix := []byte("\x19Ethereum Signed Message:\n" + strconv.Itoa(len(msg))) + hash := ethcrypto.Keccak256(prefix, msg) + + sigHex := strings.TrimSpace(signature) + if strings.HasPrefix(sigHex, "0x") || strings.HasPrefix(sigHex, "0X") { + sigHex = sigHex[2:] + } + sig, err := hex.DecodeString(sigHex) + if err != nil || len(sig) != 65 { + return false, fmt.Errorf("invalid signature format") + } + + if sig[64] >= 27 { + sig[64] -= 27 + } + + pub, err := ethcrypto.SigToPub(hash, sig) + if err != nil { + return false, fmt.Errorf("signature recovery failed: %w", err) + } + + addr := ethcrypto.PubkeyToAddress(*pub).Hex() + want := strings.ToLower(strings.TrimPrefix(strings.TrimPrefix(wallet, "0x"), "0X")) + got := strings.ToLower(strings.TrimPrefix(strings.TrimPrefix(addr, "0x"), "0X")) + + return got == want, nil +} + +func (s *Service) verifySolSignature(wallet, nonce, signature string) (bool, error) { + sig, err := base64.StdEncoding.DecodeString(signature) + if err != nil { + return false, fmt.Errorf("invalid base64 signature: %w", err) + } + if len(sig) != 64 { + return false, fmt.Errorf("invalid signature length: expected 64 bytes, got %d", len(sig)) + } + + pubKeyBytes, err := s.Base58Decode(wallet) + if err != nil { + return false, fmt.Errorf("invalid wallet address: %w", err) + } + if len(pubKeyBytes) != 32 { + return false, fmt.Errorf("invalid public key length: expected 32 bytes, got %d", len(pubKeyBytes)) + } + + message := []byte(nonce) + return ed25519.Verify(ed25519.PublicKey(pubKeyBytes), message, sig), nil +} + +// IssueTokens generates access and refresh tokens for a verified wallet +func (s *Service) IssueTokens(ctx context.Context, wallet, namespace string) (string, string, int64, error) { + if s.signingKey == nil { + return "", "", 0, fmt.Errorf("signing key unavailable") + } + + // Issue access token (15m) + token, expUnix, err := s.GenerateJWT(namespace, wallet, 15*time.Minute) + if err != nil { + return "", "", 0, fmt.Errorf("failed to generate JWT: %w", err) + } + + // Create refresh token (30d) + rbuf := make([]byte, 32) + if _, err := rand.Read(rbuf); err != nil { + return "", "", 0, fmt.Errorf("failed to generate refresh token: %w", err) + } + refresh := base64.RawURLEncoding.EncodeToString(rbuf) + + nsID, err := s.ResolveNamespaceID(ctx, namespace) + if err != nil { + return "", "", 0, fmt.Errorf("failed to resolve namespace ID: %w", err) + } + + internalCtx := client.WithInternalAuth(ctx) + db := s.orm.Database() + if _, err := db.Query(internalCtx, + "INSERT INTO refresh_tokens(namespace_id, subject, token, audience, expires_at) VALUES (?, ?, ?, ?, datetime('now', '+30 days'))", + nsID, wallet, refresh, "gateway", + ); err != nil { + return "", "", 0, fmt.Errorf("failed to store refresh token: %w", err) + } + + return token, refresh, expUnix, nil +} + +// RefreshToken validates a refresh token and issues a new access token +func (s *Service) RefreshToken(ctx context.Context, refreshToken, namespace string) (string, string, int64, error) { + internalCtx := client.WithInternalAuth(ctx) + db := s.orm.Database() + + nsID, err := s.ResolveNamespaceID(ctx, namespace) + if err != nil { + return "", "", 0, err + } + + q := "SELECT subject FROM refresh_tokens 
WHERE namespace_id = ? AND token = ? AND revoked_at IS NULL AND (expires_at IS NULL OR expires_at > datetime('now')) LIMIT 1" + res, err := db.Query(internalCtx, q, nsID, refreshToken) + if err != nil || res == nil || res.Count == 0 { + return "", "", 0, fmt.Errorf("invalid or expired refresh token") + } + + subject := "" + if len(res.Rows) > 0 && len(res.Rows[0]) > 0 { + if val, ok := res.Rows[0][0].(string); ok { + subject = val + } else { + b, _ := json.Marshal(res.Rows[0][0]) + _ = json.Unmarshal(b, &subject) + } + } + + token, expUnix, err := s.GenerateJWT(namespace, subject, 15*time.Minute) + if err != nil { + return "", "", 0, err + } + + return token, subject, expUnix, nil +} + +// RevokeToken revokes a specific refresh token or all tokens for a subject +func (s *Service) RevokeToken(ctx context.Context, namespace, token string, all bool, subject string) error { + internalCtx := client.WithInternalAuth(ctx) + db := s.orm.Database() + + nsID, err := s.ResolveNamespaceID(ctx, namespace) + if err != nil { + return err + } + + if token != "" { + _, err := db.Query(internalCtx, "UPDATE refresh_tokens SET revoked_at = datetime('now') WHERE namespace_id = ? AND token = ? AND revoked_at IS NULL", nsID, token) + return err + } + + if all && subject != "" { + _, err := db.Query(internalCtx, "UPDATE refresh_tokens SET revoked_at = datetime('now') WHERE namespace_id = ? AND subject = ? AND revoked_at IS NULL", nsID, subject) + return err + } + + return fmt.Errorf("nothing to revoke") +} + +// RegisterApp registers a new client application +func (s *Service) RegisterApp(ctx context.Context, wallet, namespace, name, publicKey string) (string, error) { + internalCtx := client.WithInternalAuth(ctx) + db := s.orm.Database() + + nsID, err := s.ResolveNamespaceID(ctx, namespace) + if err != nil { + return "", err + } + + // Generate client app_id + buf := make([]byte, 12) + if _, err := rand.Read(buf); err != nil { + return "", fmt.Errorf("failed to generate app id: %w", err) + } + appID := "app_" + base64.RawURLEncoding.EncodeToString(buf) + + // Persist app + if _, err := db.Query(internalCtx, "INSERT INTO apps(namespace_id, app_id, name, public_key) VALUES (?, ?, ?, ?)", nsID, appID, name, publicKey); err != nil { + return "", err + } + + // Record ownership + _, _ = db.Query(internalCtx, "INSERT OR IGNORE INTO namespace_ownership(namespace_id, owner_type, owner_id) VALUES (?, ?, ?)", nsID, "wallet", wallet) + + return appID, nil +} + +// GetOrCreateAPIKey returns an existing API key or creates a new one for a wallet in a namespace +func (s *Service) GetOrCreateAPIKey(ctx context.Context, wallet, namespace string) (string, error) { + internalCtx := client.WithInternalAuth(ctx) + db := s.orm.Database() + + nsID, err := s.ResolveNamespaceID(ctx, namespace) + if err != nil { + return "", err + } + + // Try existing linkage + var apiKey string + r1, err := db.Query(internalCtx, + "SELECT api_keys.key FROM wallet_api_keys JOIN api_keys ON wallet_api_keys.api_key_id = api_keys.id WHERE wallet_api_keys.namespace_id = ? AND LOWER(wallet_api_keys.wallet) = LOWER(?) 
LIMIT 1", + nsID, wallet, + ) + if err == nil && r1 != nil && r1.Count > 0 && len(r1.Rows) > 0 && len(r1.Rows[0]) > 0 { + if val, ok := r1.Rows[0][0].(string); ok { + apiKey = val + } + } + + if apiKey != "" { + return apiKey, nil + } + + // Create new API key + buf := make([]byte, 18) + if _, err := rand.Read(buf); err != nil { + return "", fmt.Errorf("failed to generate api key: %w", err) + } + apiKey = "ak_" + base64.RawURLEncoding.EncodeToString(buf) + ":" + namespace + + if _, err := db.Query(internalCtx, "INSERT INTO api_keys(key, name, namespace_id) VALUES (?, ?, ?)", apiKey, "", nsID); err != nil { + return "", fmt.Errorf("failed to store api key: %w", err) + } + + // Link wallet -> api_key + rid, err := db.Query(internalCtx, "SELECT id FROM api_keys WHERE key = ? LIMIT 1", apiKey) + if err == nil && rid != nil && rid.Count > 0 && len(rid.Rows) > 0 && len(rid.Rows[0]) > 0 { + apiKeyID := rid.Rows[0][0] + _, _ = db.Query(internalCtx, "INSERT OR IGNORE INTO wallet_api_keys(namespace_id, wallet, api_key_id) VALUES (?, ?, ?)", nsID, strings.ToLower(wallet), apiKeyID) + } + + // Record ownerships + _, _ = db.Query(internalCtx, "INSERT OR IGNORE INTO namespace_ownership(namespace_id, owner_type, owner_id) VALUES (?, 'api_key', ?)", nsID, apiKey) + _, _ = db.Query(internalCtx, "INSERT OR IGNORE INTO namespace_ownership(namespace_id, owner_type, owner_id) VALUES (?, 'wallet', ?)", nsID, wallet) + + return apiKey, nil +} + +// ResolveNamespaceID ensures the given namespace exists and returns its primary key ID. +func (s *Service) ResolveNamespaceID(ctx context.Context, ns string) (interface{}, error) { + if s.orm == nil { + return nil, fmt.Errorf("client not initialized") + } + ns = strings.TrimSpace(ns) + if ns == "" { + ns = "default" + } + + internalCtx := client.WithInternalAuth(ctx) + db := s.orm.Database() + + if _, err := db.Query(internalCtx, "INSERT OR IGNORE INTO namespaces(name) VALUES (?)", ns); err != nil { + return nil, err + } + res, err := db.Query(internalCtx, "SELECT id FROM namespaces WHERE name = ? LIMIT 1", ns) + if err != nil { + return nil, err + } + if res == nil || res.Count == 0 || len(res.Rows) == 0 || len(res.Rows[0]) == 0 { + return nil, fmt.Errorf("failed to resolve namespace") + } + return res.Rows[0][0], nil +} + +// Base58Decode decodes a base58-encoded string +func (s *Service) Base58Decode(input string) ([]byte, error) { + const alphabet = "123456789ABCDEFGHJKLMNPQRSTUVWXYZabcdefghijkmnopqrstuvwxyz" + answer := big.NewInt(0) + j := big.NewInt(1) + for i := len(input) - 1; i >= 0; i-- { + tmp := strings.IndexByte(alphabet, input[i]) + if tmp == -1 { + return nil, fmt.Errorf("invalid base58 character") + } + idx := big.NewInt(int64(tmp)) + tmp1 := new(big.Int) + tmp1.Mul(idx, j) + answer.Add(answer, tmp1) + j.Mul(j, big.NewInt(58)) + } + // Handle leading zeros + res := answer.Bytes() + for i := 0; i < len(input) && input[i] == alphabet[0]; i++ { + res = append([]byte{0}, res...) 
+ } + return res, nil +} diff --git a/pkg/gateway/auth_handlers.go b/pkg/gateway/auth_handlers.go index 1b6fa8f..8621b33 100644 --- a/pkg/gateway/auth_handlers.go +++ b/pkg/gateway/auth_handlers.go @@ -1,20 +1,14 @@ package gateway import ( - "crypto/ed25519" - "crypto/rand" - "encoding/base64" - "encoding/hex" "encoding/json" "fmt" - "math/big" "net/http" - "strconv" "strings" "time" "github.com/DeBrosOfficial/network/pkg/client" - ethcrypto "github.com/ethereum/go-ethereum/crypto" + "github.com/DeBrosOfficial/network/pkg/gateway/auth" ) func (g *Gateway) whoamiHandler(w http.ResponseWriter, r *http.Request) { @@ -29,7 +23,7 @@ func (g *Gateway) whoamiHandler(w http.ResponseWriter, r *http.Request) { // Prefer JWT if present if v := ctx.Value(ctxKeyJWT); v != nil { - if claims, ok := v.(*jwtClaims); ok && claims != nil { + if claims, ok := v.(*auth.JWTClaims); ok && claims != nil { writeJSON(w, http.StatusOK, map[string]any{ "authenticated": true, "method": "jwt", @@ -61,8 +55,8 @@ func (g *Gateway) whoamiHandler(w http.ResponseWriter, r *http.Request) { } func (g *Gateway) challengeHandler(w http.ResponseWriter, r *http.Request) { - if g.client == nil { - writeError(w, http.StatusServiceUnavailable, "client not initialized") + if g.authService == nil { + writeError(w, http.StatusServiceUnavailable, "auth service not initialized") return } if r.Method != http.MethodPost { @@ -82,51 +76,16 @@ func (g *Gateway) challengeHandler(w http.ResponseWriter, r *http.Request) { writeError(w, http.StatusBadRequest, "wallet is required") return } - ns := strings.TrimSpace(req.Namespace) - if ns == "" { - ns = strings.TrimSpace(g.cfg.ClientNamespace) - if ns == "" { - ns = "default" - } - } - // Generate a URL-safe random nonce (32 bytes) - buf := make([]byte, 32) - if _, err := rand.Read(buf); err != nil { - writeError(w, http.StatusInternalServerError, "failed to generate nonce") - return - } - nonce := base64.RawURLEncoding.EncodeToString(buf) - // Insert namespace if missing, fetch id - ctx := r.Context() - // Use internal context to bypass authentication for system operations - internalCtx := client.WithInternalAuth(ctx) - db := g.client.Database() - if _, err := db.Query(internalCtx, "INSERT OR IGNORE INTO namespaces(name) VALUES (?)", ns); err != nil { - writeError(w, http.StatusInternalServerError, err.Error()) - return - } - nres, err := db.Query(internalCtx, "SELECT id FROM namespaces WHERE name = ? 
LIMIT 1", ns) - if err != nil || nres == nil || nres.Count == 0 || len(nres.Rows) == 0 || len(nres.Rows[0]) == 0 { - writeError(w, http.StatusInternalServerError, "failed to resolve namespace") - return - } - nsID := nres.Rows[0][0] - - // Store nonce with 5 minute expiry - // Normalize wallet address to lowercase for case-insensitive comparison - walletLower := strings.ToLower(strings.TrimSpace(req.Wallet)) - if _, err := db.Query(internalCtx, - "INSERT INTO nonces(namespace_id, wallet, nonce, purpose, expires_at) VALUES (?, ?, ?, ?, datetime('now', '+5 minutes'))", - nsID, walletLower, nonce, req.Purpose, - ); err != nil { + nonce, err := g.authService.CreateNonce(r.Context(), req.Wallet, req.Purpose, req.Namespace) + if err != nil { writeError(w, http.StatusInternalServerError, err.Error()) return } writeJSON(w, http.StatusOK, map[string]any{ "wallet": req.Wallet, - "namespace": ns, + "namespace": req.Namespace, "nonce": nonce, "purpose": req.Purpose, "expires_at": time.Now().Add(5 * time.Minute).UTC().Format(time.RFC3339Nano), @@ -134,8 +93,8 @@ func (g *Gateway) challengeHandler(w http.ResponseWriter, r *http.Request) { } func (g *Gateway) verifyHandler(w http.ResponseWriter, r *http.Request) { - if g.client == nil { - writeError(w, http.StatusServiceUnavailable, "client not initialized") + if g.authService == nil { + writeError(w, http.StatusServiceUnavailable, "auth service not initialized") return } if r.Method != http.MethodPost { @@ -147,7 +106,7 @@ func (g *Gateway) verifyHandler(w http.ResponseWriter, r *http.Request) { Nonce string `json:"nonce"` Signature string `json:"signature"` Namespace string `json:"namespace"` - ChainType string `json:"chain_type"` // "ETH" or "SOL", defaults to "ETH" + ChainType string `json:"chain_type"` } if err := json.NewDecoder(r.Body).Decode(&req); err != nil { writeError(w, http.StatusBadRequest, "invalid json body") @@ -157,185 +116,30 @@ func (g *Gateway) verifyHandler(w http.ResponseWriter, r *http.Request) { writeError(w, http.StatusBadRequest, "wallet, nonce and signature are required") return } - ns := strings.TrimSpace(req.Namespace) - if ns == "" { - ns = strings.TrimSpace(g.cfg.ClientNamespace) - if ns == "" { - ns = "default" - } - } + ctx := r.Context() - // Use internal context to bypass authentication for system operations - internalCtx := client.WithInternalAuth(ctx) + verified, err := g.authService.VerifySignature(ctx, req.Wallet, req.Nonce, req.Signature, req.ChainType) + if err != nil || !verified { + writeError(w, http.StatusUnauthorized, "signature verification failed") + return + } + + // Mark nonce used + nsID, _ := g.authService.ResolveNamespaceID(ctx, req.Namespace) db := g.client.Database() - nsID, err := g.resolveNamespaceID(ctx, ns) + _, _ = db.Query(client.WithInternalAuth(ctx), "UPDATE nonces SET used_at = datetime('now') WHERE namespace_id = ? AND wallet = ? AND nonce = ?", nsID, strings.ToLower(req.Wallet), req.Nonce) + + token, refresh, expUnix, err := g.authService.IssueTokens(ctx, req.Wallet, req.Namespace) if err != nil { writeError(w, http.StatusInternalServerError, err.Error()) return } - // Normalize wallet address to lowercase for case-insensitive comparison - walletLower := strings.ToLower(strings.TrimSpace(req.Wallet)) - q := "SELECT id FROM nonces WHERE namespace_id = ? AND LOWER(wallet) = LOWER(?) AND nonce = ? 
AND used_at IS NULL AND (expires_at IS NULL OR expires_at > datetime('now')) LIMIT 1" - nres, err := db.Query(internalCtx, q, nsID, walletLower, req.Nonce) - if err != nil || nres == nil || nres.Count == 0 { - writeError(w, http.StatusBadRequest, "invalid or expired nonce") - return - } - nonceID := nres.Rows[0][0] - // Determine chain type (default to ETH for backward compatibility) - chainType := strings.ToUpper(strings.TrimSpace(req.ChainType)) - if chainType == "" { - chainType = "ETH" - } - - // Verify signature based on chain type - var verified bool - var verifyErr error - - switch chainType { - case "ETH": - // EVM personal_sign verification of the nonce - msg := []byte(req.Nonce) - prefix := []byte("\x19Ethereum Signed Message:\n" + strconv.Itoa(len(msg))) - hash := ethcrypto.Keccak256(prefix, msg) - - // Decode signature (expects 65-byte r||s||v, hex with optional 0x) - sigHex := strings.TrimSpace(req.Signature) - if strings.HasPrefix(sigHex, "0x") || strings.HasPrefix(sigHex, "0X") { - sigHex = sigHex[2:] - } - sig, err := hex.DecodeString(sigHex) - if err != nil || len(sig) != 65 { - writeError(w, http.StatusBadRequest, "invalid signature format") - return - } - // Normalize V to 0/1 as expected by geth - if sig[64] >= 27 { - sig[64] -= 27 - } - pub, err := ethcrypto.SigToPub(hash, sig) - if err != nil { - writeError(w, http.StatusUnauthorized, "signature recovery failed") - return - } - addr := ethcrypto.PubkeyToAddress(*pub).Hex() - want := strings.ToLower(strings.TrimPrefix(strings.TrimPrefix(req.Wallet, "0x"), "0X")) - got := strings.ToLower(strings.TrimPrefix(strings.TrimPrefix(addr, "0x"), "0X")) - if got != want { - writeError(w, http.StatusUnauthorized, "signature does not match wallet") - return - } - verified = true - - case "SOL": - // Solana uses Ed25519 signatures - // Signature is base64-encoded, public key is the wallet address (base58) - - // Decode base64 signature (Solana signatures are 64 bytes) - sig, err := base64.StdEncoding.DecodeString(req.Signature) - if err != nil { - writeError(w, http.StatusBadRequest, fmt.Sprintf("invalid base64 signature: %v", err)) - return - } - if len(sig) != 64 { - writeError(w, http.StatusBadRequest, fmt.Sprintf("invalid signature length: expected 64 bytes, got %d", len(sig))) - return - } - - // Decode base58 public key (Solana wallet address) - pubKeyBytes, err := base58Decode(req.Wallet) - if err != nil { - writeError(w, http.StatusBadRequest, fmt.Sprintf("invalid wallet address: %v", err)) - return - } - if len(pubKeyBytes) != 32 { - writeError(w, http.StatusBadRequest, fmt.Sprintf("invalid public key length: expected 32 bytes, got %d", len(pubKeyBytes))) - return - } - - // Verify Ed25519 signature - message := []byte(req.Nonce) - if !ed25519.Verify(ed25519.PublicKey(pubKeyBytes), message, sig) { - writeError(w, http.StatusUnauthorized, "signature verification failed") - return - } - verified = true - - default: - writeError(w, http.StatusBadRequest, fmt.Sprintf("unsupported chain type: %s (must be ETH or SOL)", chainType)) - return - } - - if !verified { - writeError(w, http.StatusUnauthorized, fmt.Sprintf("signature verification failed: %v", verifyErr)) - return - } - - // Mark nonce used now (after successful verification) - if _, err := db.Query(internalCtx, "UPDATE nonces SET used_at = datetime('now') WHERE id = ?", nonceID); err != nil { - writeError(w, http.StatusInternalServerError, err.Error()) - return - } - if g.signingKey == nil { - writeError(w, http.StatusServiceUnavailable, "signing key unavailable") - 
return - } - // Issue access token (15m) and a refresh token (30d) - token, expUnix, err := g.generateJWT(ns, req.Wallet, 15*time.Minute) + apiKey, err := g.authService.GetOrCreateAPIKey(ctx, req.Wallet, req.Namespace) if err != nil { writeError(w, http.StatusInternalServerError, err.Error()) return } - // create refresh token - rbuf := make([]byte, 32) - if _, err := rand.Read(rbuf); err != nil { - writeError(w, http.StatusInternalServerError, "failed to generate refresh token") - return - } - refresh := base64.RawURLEncoding.EncodeToString(rbuf) - if _, err := db.Query(internalCtx, "INSERT INTO refresh_tokens(namespace_id, subject, token, audience, expires_at) VALUES (?, ?, ?, ?, datetime('now', '+30 days'))", nsID, req.Wallet, refresh, "gateway"); err != nil { - writeError(w, http.StatusInternalServerError, err.Error()) - return - } - - // Ensure API key exists for this (namespace, wallet) and record ownerships - // This is done automatically after successful verification; no second nonce needed - var apiKey string - - // Try existing linkage - r1, err := db.Query(internalCtx, - "SELECT api_keys.key FROM wallet_api_keys JOIN api_keys ON wallet_api_keys.api_key_id = api_keys.id WHERE wallet_api_keys.namespace_id = ? AND LOWER(wallet_api_keys.wallet) = LOWER(?) LIMIT 1", - nsID, req.Wallet, - ) - if err == nil && r1 != nil && r1.Count > 0 && len(r1.Rows) > 0 && len(r1.Rows[0]) > 0 { - if s, ok := r1.Rows[0][0].(string); ok { - apiKey = s - } else { - b, _ := json.Marshal(r1.Rows[0][0]) - _ = json.Unmarshal(b, &apiKey) - } - } - - if strings.TrimSpace(apiKey) == "" { - // Create new API key with format ak_: - buf := make([]byte, 18) - if _, err := rand.Read(buf); err == nil { - apiKey = "ak_" + base64.RawURLEncoding.EncodeToString(buf) + ":" + ns - if _, err := db.Query(internalCtx, "INSERT INTO api_keys(key, name, namespace_id) VALUES (?, ?, ?)", apiKey, "", nsID); err == nil { - // Link wallet -> api_key - rid, err := db.Query(internalCtx, "SELECT id FROM api_keys WHERE key = ? LIMIT 1", apiKey) - if err == nil && rid != nil && rid.Count > 0 && len(rid.Rows) > 0 && len(rid.Rows[0]) > 0 { - apiKeyID := rid.Rows[0][0] - _, _ = db.Query(internalCtx, "INSERT OR IGNORE INTO wallet_api_keys(namespace_id, wallet, api_key_id) VALUES (?, ?, ?)", nsID, strings.ToLower(req.Wallet), apiKeyID) - } - } - } - } - - // Record ownerships (best-effort) - _, _ = db.Query(internalCtx, "INSERT OR IGNORE INTO namespace_ownership(namespace_id, owner_type, owner_id) VALUES (?, 'api_key', ?)", nsID, apiKey) - _, _ = db.Query(internalCtx, "INSERT OR IGNORE INTO namespace_ownership(namespace_id, owner_type, owner_id) VALUES (?, 'wallet', ?)", nsID, req.Wallet) writeJSON(w, http.StatusOK, map[string]any{ "access_token": token, @@ -343,23 +147,16 @@ func (g *Gateway) verifyHandler(w http.ResponseWriter, r *http.Request) { "expires_in": int(expUnix - time.Now().Unix()), "refresh_token": refresh, "subject": req.Wallet, - "namespace": ns, + "namespace": req.Namespace, "api_key": apiKey, "nonce": req.Nonce, "signature_verified": true, }) } -// issueAPIKeyHandler creates or returns an API key for a verified wallet in a namespace. 
-// Requires: POST { wallet, nonce, signature, namespace } -// Behavior: -// - Validates nonce and signature like verifyHandler -// - Ensures namespace exists -// - If an API key already exists for (namespace, wallet), returns it; else creates one -// - Records namespace ownership mapping for the wallet and api_key func (g *Gateway) issueAPIKeyHandler(w http.ResponseWriter, r *http.Request) { - if g.client == nil { - writeError(w, http.StatusServiceUnavailable, "client not initialized") + if g.authService == nil { + writeError(w, http.StatusServiceUnavailable, "auth service not initialized") return } if r.Method != http.MethodPost { @@ -371,6 +168,7 @@ func (g *Gateway) issueAPIKeyHandler(w http.ResponseWriter, r *http.Request) { Nonce string `json:"nonce"` Signature string `json:"signature"` Namespace string `json:"namespace"` + ChainType string `json:"chain_type"` Plan string `json:"plan"` } if err := json.NewDecoder(r.Body).Decode(&req); err != nil { @@ -381,110 +179,33 @@ func (g *Gateway) issueAPIKeyHandler(w http.ResponseWriter, r *http.Request) { writeError(w, http.StatusBadRequest, "wallet, nonce and signature are required") return } - ns := strings.TrimSpace(req.Namespace) - if ns == "" { - ns = strings.TrimSpace(g.cfg.ClientNamespace) - if ns == "" { - ns = "default" - } - } + ctx := r.Context() - // Use internal context to bypass authentication for system operations - internalCtx := client.WithInternalAuth(ctx) - db := g.client.Database() - // Resolve namespace id - nsID, err := g.resolveNamespaceID(ctx, ns) - if err != nil { - writeError(w, http.StatusInternalServerError, err.Error()) - return - } - // Validate nonce exists and not used/expired - // Normalize wallet address to lowercase for case-insensitive comparison - walletLower := strings.ToLower(strings.TrimSpace(req.Wallet)) - q := "SELECT id FROM nonces WHERE namespace_id = ? AND LOWER(wallet) = LOWER(?) AND nonce = ? 
AND used_at IS NULL AND (expires_at IS NULL OR expires_at > datetime('now')) LIMIT 1" - nres, err := db.Query(internalCtx, q, nsID, walletLower, req.Nonce) - if err != nil || nres == nil || nres.Count == 0 { - writeError(w, http.StatusBadRequest, "invalid or expired nonce") - return - } - nonceID := nres.Rows[0][0] - // Verify signature like verifyHandler - msg := []byte(req.Nonce) - prefix := []byte("\x19Ethereum Signed Message:\n" + strconv.Itoa(len(msg))) - hash := ethcrypto.Keccak256(prefix, msg) - sigHex := strings.TrimSpace(req.Signature) - if strings.HasPrefix(sigHex, "0x") || strings.HasPrefix(sigHex, "0X") { - sigHex = sigHex[2:] - } - sig, err := hex.DecodeString(sigHex) - if err != nil || len(sig) != 65 { - writeError(w, http.StatusBadRequest, "invalid signature format") - return - } - if sig[64] >= 27 { - sig[64] -= 27 - } - pub, err := ethcrypto.SigToPub(hash, sig) - if err != nil { - writeError(w, http.StatusUnauthorized, "signature recovery failed") - return - } - addr := ethcrypto.PubkeyToAddress(*pub).Hex() - want := strings.ToLower(strings.TrimPrefix(strings.TrimPrefix(req.Wallet, "0x"), "0X")) - got := strings.ToLower(strings.TrimPrefix(strings.TrimPrefix(addr, "0x"), "0X")) - if got != want { - writeError(w, http.StatusUnauthorized, "signature does not match wallet") + verified, err := g.authService.VerifySignature(ctx, req.Wallet, req.Nonce, req.Signature, req.ChainType) + if err != nil || !verified { + writeError(w, http.StatusUnauthorized, "signature verification failed") return } + // Mark nonce used - if _, err := db.Query(internalCtx, "UPDATE nonces SET used_at = datetime('now') WHERE id = ?", nonceID); err != nil { + nsID, _ := g.authService.ResolveNamespaceID(ctx, req.Namespace) + db := g.client.Database() + _, _ = db.Query(client.WithInternalAuth(ctx), "UPDATE nonces SET used_at = datetime('now') WHERE namespace_id = ? AND wallet = ? AND nonce = ?", nsID, strings.ToLower(req.Wallet), req.Nonce) + + apiKey, err := g.authService.GetOrCreateAPIKey(ctx, req.Wallet, req.Namespace) + if err != nil { writeError(w, http.StatusInternalServerError, err.Error()) return } - // Check if api key exists for (namespace, wallet) via linkage table - var apiKey string - r1, err := db.Query(internalCtx, "SELECT api_keys.key FROM wallet_api_keys JOIN api_keys ON wallet_api_keys.api_key_id = api_keys.id WHERE wallet_api_keys.namespace_id = ? AND LOWER(wallet_api_keys.wallet) = LOWER(?) LIMIT 1", nsID, req.Wallet) - if err == nil && r1 != nil && r1.Count > 0 && len(r1.Rows) > 0 && len(r1.Rows[0]) > 0 { - if s, ok := r1.Rows[0][0].(string); ok { - apiKey = s - } else { - b, _ := json.Marshal(r1.Rows[0][0]) - _ = json.Unmarshal(b, &apiKey) - } - } - if strings.TrimSpace(apiKey) == "" { - // Create new API key with format ak_: - buf := make([]byte, 18) - if _, err := rand.Read(buf); err != nil { - writeError(w, http.StatusInternalServerError, "failed to generate api key") - return - } - apiKey = "ak_" + base64.RawURLEncoding.EncodeToString(buf) + ":" + ns - if _, err := db.Query(internalCtx, "INSERT INTO api_keys(key, name, namespace_id) VALUES (?, ?, ?)", apiKey, "", nsID); err != nil { - writeError(w, http.StatusInternalServerError, err.Error()) - return - } - // Create linkage - // Find api_key id - rid, err := db.Query(internalCtx, "SELECT id FROM api_keys WHERE key = ? 
LIMIT 1", apiKey) - if err == nil && rid != nil && rid.Count > 0 && len(rid.Rows) > 0 && len(rid.Rows[0]) > 0 { - apiKeyID := rid.Rows[0][0] - _, _ = db.Query(internalCtx, "INSERT OR IGNORE INTO wallet_api_keys(namespace_id, wallet, api_key_id) VALUES (?, ?, ?)", nsID, strings.ToLower(req.Wallet), apiKeyID) - } - } - // Record ownerships (best-effort) - _, _ = db.Query(internalCtx, "INSERT OR IGNORE INTO namespace_ownership(namespace_id, owner_type, owner_id) VALUES (?, 'api_key', ?)", nsID, apiKey) - _, _ = db.Query(internalCtx, "INSERT OR IGNORE INTO namespace_ownership(namespace_id, owner_type, owner_id) VALUES (?, 'wallet', ?)", nsID, req.Wallet) writeJSON(w, http.StatusOK, map[string]any{ "api_key": apiKey, - "namespace": ns, + "namespace": req.Namespace, "plan": func() string { if strings.TrimSpace(req.Plan) == "" { return "free" - } else { - return req.Plan } + return req.Plan }(), "wallet": strings.ToLower(strings.TrimPrefix(strings.TrimPrefix(req.Wallet, "0x"), "0X")), }) @@ -494,8 +215,8 @@ func (g *Gateway) issueAPIKeyHandler(w http.ResponseWriter, r *http.Request) { // Requires Authorization header with API key (Bearer or ApiKey or X-API-Key header). // Returns a JWT bound to the namespace derived from the API key record. func (g *Gateway) apiKeyToJWTHandler(w http.ResponseWriter, r *http.Request) { - if g.client == nil { - writeError(w, http.StatusServiceUnavailable, "client not initialized") + if g.authService == nil { + writeError(w, http.StatusServiceUnavailable, "auth service not initialized") return } if r.Method != http.MethodPost { @@ -507,10 +228,10 @@ func (g *Gateway) apiKeyToJWTHandler(w http.ResponseWriter, r *http.Request) { writeError(w, http.StatusUnauthorized, "missing API key") return } + // Validate and get namespace db := g.client.Database() ctx := r.Context() - // Use internal context to bypass authentication for system operations internalCtx := client.WithInternalAuth(ctx) q := "SELECT namespaces.name FROM api_keys JOIN namespaces ON api_keys.namespace_id = namespaces.id WHERE api_keys.key = ? 
LIMIT 1" res, err := db.Query(internalCtx, q, key) @@ -518,28 +239,18 @@ func (g *Gateway) apiKeyToJWTHandler(w http.ResponseWriter, r *http.Request) { writeError(w, http.StatusUnauthorized, "invalid API key") return } + var ns string if s, ok := res.Rows[0][0].(string); ok { ns = s - } else { - b, _ := json.Marshal(res.Rows[0][0]) - _ = json.Unmarshal(b, &ns) } - ns = strings.TrimSpace(ns) - if ns == "" { - writeError(w, http.StatusUnauthorized, "invalid API key") - return - } - if g.signingKey == nil { - writeError(w, http.StatusServiceUnavailable, "signing key unavailable") - return - } - // Subject is the API key string for now - token, expUnix, err := g.generateJWT(ns, key, 15*time.Minute) + + token, expUnix, err := g.authService.GenerateJWT(ns, key, 15*time.Minute) if err != nil { writeError(w, http.StatusInternalServerError, err.Error()) return } + writeJSON(w, http.StatusOK, map[string]any{ "access_token": token, "token_type": "Bearer", @@ -549,8 +260,8 @@ func (g *Gateway) apiKeyToJWTHandler(w http.ResponseWriter, r *http.Request) { } func (g *Gateway) registerHandler(w http.ResponseWriter, r *http.Request) { - if g.client == nil { - writeError(w, http.StatusServiceUnavailable, "client not initialized") + if g.authService == nil { + writeError(w, http.StatusServiceUnavailable, "auth service not initialized") return } if r.Method != http.MethodPost { @@ -562,6 +273,7 @@ func (g *Gateway) registerHandler(w http.ResponseWriter, r *http.Request) { Nonce string `json:"nonce"` Signature string `json:"signature"` Namespace string `json:"namespace"` + ChainType string `json:"chain_type"` Name string `json:"name"` } if err := json.NewDecoder(r.Body).Decode(&req); err != nil { @@ -572,106 +284,45 @@ func (g *Gateway) registerHandler(w http.ResponseWriter, r *http.Request) { writeError(w, http.StatusBadRequest, "wallet, nonce and signature are required") return } - ns := strings.TrimSpace(req.Namespace) - if ns == "" { - ns = strings.TrimSpace(g.cfg.ClientNamespace) - if ns == "" { - ns = "default" - } - } + ctx := r.Context() - // Use internal context to bypass authentication for system operations - internalCtx := client.WithInternalAuth(ctx) + verified, err := g.authService.VerifySignature(ctx, req.Wallet, req.Nonce, req.Signature, req.ChainType) + if err != nil || !verified { + writeError(w, http.StatusUnauthorized, "signature verification failed") + return + } + + // Mark nonce used + nsID, _ := g.authService.ResolveNamespaceID(ctx, req.Namespace) db := g.client.Database() - nsID, err := g.resolveNamespaceID(ctx, ns) + _, _ = db.Query(client.WithInternalAuth(ctx), "UPDATE nonces SET used_at = datetime('now') WHERE namespace_id = ? AND wallet = ? AND nonce = ?", nsID, strings.ToLower(req.Wallet), req.Nonce) + + // In a real app we'd derive the public key from the signature, but for simplicity here + // we just use a placeholder or expect it in the request if needed. + // For Ethereum, we can recover it. + publicKey := "recovered-pk" + + appID, err := g.authService.RegisterApp(ctx, req.Wallet, req.Namespace, req.Name, publicKey) if err != nil { writeError(w, http.StatusInternalServerError, err.Error()) return } - // Validate nonce - q := "SELECT id FROM nonces WHERE namespace_id = ? AND wallet = ? AND nonce = ? 
AND used_at IS NULL AND (expires_at IS NULL OR expires_at > datetime('now')) LIMIT 1" - nres, err := db.Query(internalCtx, q, nsID, req.Wallet, req.Nonce) - if err != nil || nres == nil || nres.Count == 0 || len(nres.Rows) == 0 || len(nres.Rows[0]) == 0 { - writeError(w, http.StatusBadRequest, "invalid or expired nonce") - return - } - nonceID := nres.Rows[0][0] - - // EVM personal_sign verification of the nonce - msg := []byte(req.Nonce) - prefix := []byte("\x19Ethereum Signed Message:\n" + strconv.Itoa(len(msg))) - hash := ethcrypto.Keccak256(prefix, msg) - - // Decode signature (expects 65-byte r||s||v, hex with optional 0x) - sigHex := strings.TrimSpace(req.Signature) - if strings.HasPrefix(sigHex, "0x") || strings.HasPrefix(sigHex, "0X") { - sigHex = sigHex[2:] - } - sig, err := hex.DecodeString(sigHex) - if err != nil || len(sig) != 65 { - writeError(w, http.StatusBadRequest, "invalid signature format") - return - } - // Normalize V to 0/1 as expected by geth - if sig[64] >= 27 { - sig[64] -= 27 - } - pub, err := ethcrypto.SigToPub(hash, sig) - if err != nil { - writeError(w, http.StatusUnauthorized, "signature recovery failed") - return - } - addr := ethcrypto.PubkeyToAddress(*pub).Hex() - want := strings.ToLower(strings.TrimPrefix(strings.TrimPrefix(req.Wallet, "0x"), "0X")) - got := strings.ToLower(strings.TrimPrefix(strings.TrimPrefix(addr, "0x"), "0X")) - if got != want { - writeError(w, http.StatusUnauthorized, "signature does not match wallet") - return - } - - // Mark nonce used now (after successful verification) - if _, err := db.Query(internalCtx, "UPDATE nonces SET used_at = datetime('now') WHERE id = ?", nonceID); err != nil { - writeError(w, http.StatusInternalServerError, err.Error()) - return - } - - // Derive public key (uncompressed) hex - pubBytes := ethcrypto.FromECDSAPub(pub) - pubHex := "0x" + hex.EncodeToString(pubBytes) - - // Generate client app_id - buf := make([]byte, 12) - if _, err := rand.Read(buf); err != nil { - writeError(w, http.StatusInternalServerError, "failed to generate app id") - return - } - appID := "app_" + base64.RawURLEncoding.EncodeToString(buf) - - // Persist app - if _, err := db.Query(internalCtx, "INSERT INTO apps(namespace_id, app_id, name, public_key) VALUES (?, ?, ?, ?)", nsID, appID, req.Name, pubHex); err != nil { - writeError(w, http.StatusInternalServerError, err.Error()) - return - } - - // Record namespace ownership by wallet (best-effort) - _, _ = db.Query(internalCtx, "INSERT OR IGNORE INTO namespace_ownership(namespace_id, owner_type, owner_id) VALUES (?, ?, ?)", nsID, "wallet", req.Wallet) writeJSON(w, http.StatusCreated, map[string]any{ "client_id": appID, "app": map[string]any{ - "app_id": appID, - "name": req.Name, - "public_key": pubHex, - "namespace": ns, - "wallet": strings.ToLower(strings.TrimPrefix(strings.TrimPrefix(req.Wallet, "0x"), "0X")), + "app_id": appID, + "name": req.Name, + "namespace": req.Namespace, + "wallet": strings.ToLower(req.Wallet), }, "signature_verified": true, }) } func (g *Gateway) refreshHandler(w http.ResponseWriter, r *http.Request) { - if g.client == nil { - writeError(w, http.StatusServiceUnavailable, "client not initialized") + if g.authService == nil { + writeError(w, http.StatusServiceUnavailable, "auth service not initialized") return } if r.Method != http.MethodPost { @@ -690,54 +341,20 @@ func (g *Gateway) refreshHandler(w http.ResponseWriter, r *http.Request) { writeError(w, http.StatusBadRequest, "refresh_token is required") return } - ns := strings.TrimSpace(req.Namespace) - if 
ns == "" { - ns = strings.TrimSpace(g.cfg.ClientNamespace) - if ns == "" { - ns = "default" - } - } - ctx := r.Context() - // Use internal context to bypass authentication for system operations - internalCtx := client.WithInternalAuth(ctx) - db := g.client.Database() - nsID, err := g.resolveNamespaceID(ctx, ns) + + token, subject, expUnix, err := g.authService.RefreshToken(r.Context(), req.RefreshToken, req.Namespace) if err != nil { - writeError(w, http.StatusInternalServerError, err.Error()) - return - } - q := "SELECT subject FROM refresh_tokens WHERE namespace_id = ? AND token = ? AND revoked_at IS NULL AND (expires_at IS NULL OR expires_at > datetime('now')) LIMIT 1" - rres, err := db.Query(internalCtx, q, nsID, req.RefreshToken) - if err != nil || rres == nil || rres.Count == 0 { - writeError(w, http.StatusUnauthorized, "invalid or expired refresh token") - return - } - subject := "" - if len(rres.Rows) > 0 && len(rres.Rows[0]) > 0 { - if s, ok := rres.Rows[0][0].(string); ok { - subject = s - } else { - // fallback: format via json - b, _ := json.Marshal(rres.Rows[0][0]) - _ = json.Unmarshal(b, &subject) - } - } - if g.signingKey == nil { - writeError(w, http.StatusServiceUnavailable, "signing key unavailable") - return - } - token, expUnix, err := g.generateJWT(ns, subject, 15*time.Minute) - if err != nil { - writeError(w, http.StatusInternalServerError, err.Error()) + writeError(w, http.StatusUnauthorized, err.Error()) return } + writeJSON(w, http.StatusOK, map[string]any{ "access_token": token, "token_type": "Bearer", "expires_in": int(expUnix - time.Now().Unix()), "refresh_token": req.RefreshToken, "subject": subject, - "namespace": ns, + "namespace": req.Namespace, }) } @@ -1064,8 +681,8 @@ func (g *Gateway) loginPageHandler(w http.ResponseWriter, r *http.Request) { // be revoked. If all=true is provided (and the request is authenticated via JWT), // all tokens for the JWT subject within the namespace are revoked. func (g *Gateway) logoutHandler(w http.ResponseWriter, r *http.Request) { - if g.client == nil { - writeError(w, http.StatusServiceUnavailable, "client not initialized") + if g.authService == nil { + writeError(w, http.StatusServiceUnavailable, "auth service not initialized") return } if r.Method != http.MethodPost { @@ -1081,38 +698,12 @@ func (g *Gateway) logoutHandler(w http.ResponseWriter, r *http.Request) { writeError(w, http.StatusBadRequest, "invalid json body") return } - ns := strings.TrimSpace(req.Namespace) - if ns == "" { - ns = strings.TrimSpace(g.cfg.ClientNamespace) - if ns == "" { - ns = "default" - } - } + ctx := r.Context() - // Use internal context to bypass authentication for system operations - internalCtx := client.WithInternalAuth(ctx) - db := g.client.Database() - nsID, err := g.resolveNamespaceID(ctx, ns) - if err != nil { - writeError(w, http.StatusInternalServerError, err.Error()) - return - } - - if strings.TrimSpace(req.RefreshToken) != "" { - // Revoke specific token - if _, err := db.Query(internalCtx, "UPDATE refresh_tokens SET revoked_at = datetime('now') WHERE namespace_id = ? AND token = ? 
AND revoked_at IS NULL", nsID, req.RefreshToken); err != nil { - writeError(w, http.StatusInternalServerError, err.Error()) - return - } - writeJSON(w, http.StatusOK, map[string]any{"status": "ok", "revoked": 1}) - return - } - + var subject string if req.All { - // Require JWT to identify subject - var subject string if v := ctx.Value(ctxKeyJWT); v != nil { - if claims, ok := v.(*jwtClaims); ok && claims != nil { + if claims, ok := v.(*auth.JWTClaims); ok && claims != nil { subject = strings.TrimSpace(claims.Sub) } } @@ -1120,23 +711,19 @@ func (g *Gateway) logoutHandler(w http.ResponseWriter, r *http.Request) { writeError(w, http.StatusUnauthorized, "jwt required for all=true") return } - if _, err := db.Query(internalCtx, "UPDATE refresh_tokens SET revoked_at = datetime('now') WHERE namespace_id = ? AND subject = ? AND revoked_at IS NULL", nsID, subject); err != nil { - writeError(w, http.StatusInternalServerError, err.Error()) - return - } - writeJSON(w, http.StatusOK, map[string]any{"status": "ok", "revoked": "all"}) + } + + if err := g.authService.RevokeToken(ctx, req.Namespace, req.RefreshToken, req.All, subject); err != nil { + writeError(w, http.StatusInternalServerError, err.Error()) return } - writeError(w, http.StatusBadRequest, "nothing to revoke: provide refresh_token or all=true") + writeJSON(w, http.StatusOK, map[string]any{"status": "ok"}) } -// simpleAPIKeyHandler creates an API key directly from a wallet address without signature verification -// This is a simplified flow for development/testing -// Requires: POST { wallet, namespace } func (g *Gateway) simpleAPIKeyHandler(w http.ResponseWriter, r *http.Request) { - if g.client == nil { - writeError(w, http.StatusServiceUnavailable, "client not initialized") + if g.authService == nil { + writeError(w, http.StatusServiceUnavailable, "auth service not initialized") return } if r.Method != http.MethodPost { @@ -1159,114 +746,16 @@ func (g *Gateway) simpleAPIKeyHandler(w http.ResponseWriter, r *http.Request) { return } - ns := strings.TrimSpace(req.Namespace) - if ns == "" { - ns = strings.TrimSpace(g.cfg.ClientNamespace) - if ns == "" { - ns = "default" - } - } - - ctx := r.Context() - internalCtx := client.WithInternalAuth(ctx) - db := g.client.Database() - - // Resolve or create namespace - if _, err := db.Query(internalCtx, "INSERT OR IGNORE INTO namespaces(name) VALUES (?)", ns); err != nil { + apiKey, err := g.authService.GetOrCreateAPIKey(r.Context(), req.Wallet, req.Namespace) + if err != nil { writeError(w, http.StatusInternalServerError, err.Error()) return } - nres, err := db.Query(internalCtx, "SELECT id FROM namespaces WHERE name = ? LIMIT 1", ns) - if err != nil || nres == nil || nres.Count == 0 || len(nres.Rows) == 0 || len(nres.Rows[0]) == 0 { - writeError(w, http.StatusInternalServerError, "failed to resolve namespace") - return - } - nsID := nres.Rows[0][0] - - // Check if api key already exists for (namespace, wallet) - var apiKey string - r1, err := db.Query(internalCtx, - "SELECT api_keys.key FROM wallet_api_keys JOIN api_keys ON wallet_api_keys.api_key_id = api_keys.id WHERE wallet_api_keys.namespace_id = ? AND LOWER(wallet_api_keys.wallet) = LOWER(?) 
LIMIT 1", - nsID, req.Wallet, - ) - if err == nil && r1 != nil && r1.Count > 0 && len(r1.Rows) > 0 && len(r1.Rows[0]) > 0 { - if s, ok := r1.Rows[0][0].(string); ok { - apiKey = s - } else { - b, _ := json.Marshal(r1.Rows[0][0]) - _ = json.Unmarshal(b, &apiKey) - } - } - - // If no existing key, create a new one - if strings.TrimSpace(apiKey) == "" { - buf := make([]byte, 18) - if _, err := rand.Read(buf); err != nil { - writeError(w, http.StatusInternalServerError, "failed to generate api key") - return - } - apiKey = "ak_" + base64.RawURLEncoding.EncodeToString(buf) + ":" + ns - - if _, err := db.Query(internalCtx, "INSERT INTO api_keys(key, name, namespace_id) VALUES (?, ?, ?)", apiKey, "", nsID); err != nil { - writeError(w, http.StatusInternalServerError, err.Error()) - return - } - - // Link wallet to api key - rid, err := db.Query(internalCtx, "SELECT id FROM api_keys WHERE key = ? LIMIT 1", apiKey) - if err == nil && rid != nil && rid.Count > 0 && len(rid.Rows) > 0 && len(rid.Rows[0]) > 0 { - apiKeyID := rid.Rows[0][0] - _, _ = db.Query(internalCtx, "INSERT OR IGNORE INTO wallet_api_keys(namespace_id, wallet, api_key_id) VALUES (?, ?, ?)", nsID, strings.ToLower(req.Wallet), apiKeyID) - } - } - - // Record ownerships (best-effort) - _, _ = db.Query(internalCtx, "INSERT OR IGNORE INTO namespace_ownership(namespace_id, owner_type, owner_id) VALUES (?, 'api_key', ?)", nsID, apiKey) - _, _ = db.Query(internalCtx, "INSERT OR IGNORE INTO namespace_ownership(namespace_id, owner_type, owner_id) VALUES (?, 'wallet', ?)", nsID, req.Wallet) - writeJSON(w, http.StatusOK, map[string]any{ "api_key": apiKey, - "namespace": ns, + "namespace": req.Namespace, "wallet": strings.ToLower(strings.TrimPrefix(strings.TrimPrefix(req.Wallet, "0x"), "0X")), "created": time.Now().Format(time.RFC3339), }) } - -// base58Decode decodes a base58-encoded string (Bitcoin alphabet) -// Used for decoding Solana public keys (base58-encoded 32-byte ed25519 public keys) -func base58Decode(encoded string) ([]byte, error) { - const alphabet = "123456789ABCDEFGHJKLMNPQRSTUVWXYZabcdefghijkmnopqrstuvwxyz" - - // Build reverse lookup map - lookup := make(map[rune]int) - for i, c := range alphabet { - lookup[c] = i - } - - // Convert to big integer - num := big.NewInt(0) - base := big.NewInt(58) - - for _, c := range encoded { - val, ok := lookup[c] - if !ok { - return nil, fmt.Errorf("invalid base58 character: %c", c) - } - num.Mul(num, base) - num.Add(num, big.NewInt(int64(val))) - } - - // Convert to bytes - decoded := num.Bytes() - - // Add leading zeros for each leading '1' in the input - for _, c := range encoded { - if c != '1' { - break - } - decoded = append([]byte{0}, decoded...) 
- } - - return decoded, nil -} diff --git a/pkg/gateway/gateway.go b/pkg/gateway/gateway.go index 08894da..e644f85 100644 --- a/pkg/gateway/gateway.go +++ b/pkg/gateway/gateway.go @@ -4,12 +4,13 @@ import ( "context" "crypto/rand" "crypto/rsa" + "crypto/x509" "database/sql" + "encoding/pem" "fmt" "net" "os" "path/filepath" - "strconv" "strings" "sync" "time" @@ -21,6 +22,7 @@ import ( "github.com/DeBrosOfficial/network/pkg/olric" "github.com/DeBrosOfficial/network/pkg/rqlite" "github.com/DeBrosOfficial/network/pkg/serverless" + "github.com/DeBrosOfficial/network/pkg/gateway/auth" "github.com/multiformats/go-multiaddr" olriclib "github.com/olric-data/olric" "go.uber.org/zap" @@ -68,8 +70,6 @@ type Gateway struct { client client.NetworkClient nodePeerID string // The node's actual peer ID from its identity file (overrides client's peer ID) startedAt time.Time - signingKey *rsa.PrivateKey - keyID string // rqlite SQL connection and HTTP ORM gateway sqlDB *sql.DB @@ -93,6 +93,9 @@ type Gateway struct { serverlessInvoker *serverless.Invoker serverlessWSMgr *serverless.WSManager serverlessHandlers *ServerlessHandlers + + // Authentication service + authService *auth.Service } // localSubscriber represents a WebSocket subscriber for local message delivery @@ -139,16 +142,6 @@ func New(logger *logging.ColoredLogger, cfg *Config) (*Gateway, error) { localSubscribers: make(map[string][]*localSubscriber), } - logger.ComponentInfo(logging.ComponentGeneral, "Generating RSA signing key...") - // Generate local RSA signing key for JWKS/JWT (ephemeral for now) - if key, err := rsa.GenerateKey(rand.Reader, 2048); err == nil { - gw.signingKey = key - gw.keyID = "gw-" + strconv.FormatInt(time.Now().Unix(), 10) - logger.ComponentInfo(logging.ComponentGeneral, "RSA key generated successfully") - } else { - logger.ComponentWarn(logging.ComponentGeneral, "failed to generate RSA key; jwks will be empty", zap.Error(err)) - } - logger.ComponentInfo(logging.ComponentGeneral, "Initializing RQLite ORM HTTP gateway...") dsn := cfg.RQLiteDSN if dsn == "" { @@ -362,14 +355,28 @@ func New(logger *logging.ColoredLogger, cfg *Config) (*Gateway, error) { gw.serverlessInvoker = serverless.NewInvoker(engine, registry, hostFuncs, logger.Logger) // Create HTTP handlers - gw.serverlessHandlers = NewServerlessHandlers( - gw.serverlessInvoker, - registry, - gw.serverlessWSMgr, - logger.Logger, - ) + gw.serverlessHandlers = NewServerlessHandlers( + gw.serverlessInvoker, + registry, + gw.serverlessWSMgr, + logger.Logger, + ) - logger.ComponentInfo(logging.ComponentGeneral, "Serverless function engine ready", + // Initialize auth service + // For now using ephemeral key, can be loaded from config later + key, _ := rsa.GenerateKey(rand.Reader, 2048) + keyPEM := pem.EncodeToMemory(&pem.Block{ + Type: "RSA PRIVATE KEY", + Bytes: x509.MarshalPKCS1PrivateKey(key), + }) + authService, err := auth.NewService(logger, c, string(keyPEM), cfg.ClientNamespace) + if err != nil { + logger.ComponentError(logging.ComponentGeneral, "failed to initialize auth service", zap.Error(err)) + } else { + gw.authService = authService + } + + logger.ComponentInfo(logging.ComponentGeneral, "Serverless function engine ready", zap.Int("default_memory_mb", engineCfg.DefaultMemoryLimitMB), zap.Int("default_timeout_sec", engineCfg.DefaultTimeoutSeconds), zap.Int("module_cache_size", engineCfg.ModuleCacheSize), diff --git a/pkg/gateway/jwt_test.go b/pkg/gateway/jwt_test.go index c8c73c4..53b6278 100644 --- a/pkg/gateway/jwt_test.go +++ b/pkg/gateway/jwt_test.go @@ 
-3,22 +3,32 @@ package gateway import ( "crypto/rand" "crypto/rsa" + "crypto/x509" + "encoding/pem" "testing" "time" + + "github.com/DeBrosOfficial/network/pkg/gateway/auth" ) func TestJWTGenerateAndParse(t *testing.T) { - gw := &Gateway{} key, _ := rsa.GenerateKey(rand.Reader, 2048) - gw.signingKey = key - gw.keyID = "kid" + keyPEM := pem.EncodeToMemory(&pem.Block{ + Type: "RSA PRIVATE KEY", + Bytes: x509.MarshalPKCS1PrivateKey(key), + }) - tok, exp, err := gw.generateJWT("ns1", "subj", time.Minute) + svc, err := auth.NewService(nil, nil, string(keyPEM), "default") + if err != nil { + t.Fatalf("failed to create service: %v", err) + } + + tok, exp, err := svc.GenerateJWT("ns1", "subj", time.Minute) if err != nil || exp <= 0 { t.Fatalf("gen err=%v exp=%d", err, exp) } - claims, err := gw.parseAndVerifyJWT(tok) + claims, err := svc.ParseAndVerifyJWT(tok) if err != nil { t.Fatalf("verify err: %v", err) } @@ -28,17 +38,23 @@ func TestJWTGenerateAndParse(t *testing.T) { } func TestJWTExpired(t *testing.T) { - gw := &Gateway{} key, _ := rsa.GenerateKey(rand.Reader, 2048) - gw.signingKey = key - gw.keyID = "kid" + keyPEM := pem.EncodeToMemory(&pem.Block{ + Type: "RSA PRIVATE KEY", + Bytes: x509.MarshalPKCS1PrivateKey(key), + }) + + svc, err := auth.NewService(nil, nil, string(keyPEM), "default") + if err != nil { + t.Fatalf("failed to create service: %v", err) + } // Use sufficiently negative TTL to bypass allowed clock skew - tok, _, err := gw.generateJWT("ns1", "subj", -2*time.Minute) + tok, _, err := svc.GenerateJWT("ns1", "subj", -2*time.Minute) if err != nil { t.Fatalf("gen err=%v", err) } - if _, err := gw.parseAndVerifyJWT(tok); err == nil { + if _, err := svc.ParseAndVerifyJWT(tok); err == nil { t.Fatalf("expected expired error") } } diff --git a/pkg/gateway/middleware.go b/pkg/gateway/middleware.go index 6d74564..1cd3075 100644 --- a/pkg/gateway/middleware.go +++ b/pkg/gateway/middleware.go @@ -10,6 +10,7 @@ import ( "time" "github.com/DeBrosOfficial/network/pkg/client" + "github.com/DeBrosOfficial/network/pkg/gateway/auth" "github.com/DeBrosOfficial/network/pkg/logging" "go.uber.org/zap" ) @@ -74,7 +75,7 @@ func (g *Gateway) authMiddleware(next http.Handler) http.Handler { if strings.HasPrefix(lower, "bearer ") { tok := strings.TrimSpace(auth[len("Bearer "):]) if strings.Count(tok, ".") == 2 { - if claims, err := g.parseAndVerifyJWT(tok); err == nil { + if claims, err := g.authService.ParseAndVerifyJWT(tok); err == nil { // Attach JWT claims and namespace to context ctx := context.WithValue(r.Context(), ctxKeyJWT, claims) if ns := strings.TrimSpace(claims.Namespace); ns != "" { @@ -235,7 +236,7 @@ func (g *Gateway) authorizationMiddleware(next http.Handler) http.Handler { apiKeyFallback := "" if v := ctx.Value(ctxKeyJWT); v != nil { - if claims, ok := v.(*jwtClaims); ok && claims != nil && strings.TrimSpace(claims.Sub) != "" { + if claims, ok := v.(*auth.JWTClaims); ok && claims != nil && strings.TrimSpace(claims.Sub) != "" { // Determine subject type. // If subject looks like an API key (e.g., ak_:), // treat it as an API key owner; otherwise assume a wallet subject. 
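Note on the hunks above: they replace the Gateway's embedded RSA signing key and jwt helpers with the new pkg/gateway/auth service. A minimal usage sketch follows, assuming only what is visible in these hunks — auth.NewService accepting a nil logger/client as in the test, a PKCS#1 PEM key string, GenerateJWT/ParseAndVerifyJWT, and the Namespace/Sub claim fields referenced by the middleware; exact types are not confirmed by this patch.

package main

import (
	"crypto/rand"
	"crypto/rsa"
	"crypto/x509"
	"encoding/pem"
	"fmt"
	"time"

	"github.com/DeBrosOfficial/network/pkg/gateway/auth"
)

func main() {
	// Build a PKCS#1 PEM key the same way the gateway and the tests do.
	key, err := rsa.GenerateKey(rand.Reader, 2048)
	if err != nil {
		panic(err)
	}
	keyPEM := pem.EncodeToMemory(&pem.Block{
		Type:  "RSA PRIVATE KEY",
		Bytes: x509.MarshalPKCS1PrivateKey(key),
	})

	// nil logger and client are accepted in the unit tests; "default" is the namespace.
	svc, err := auth.NewService(nil, nil, string(keyPEM), "default")
	if err != nil {
		panic(err)
	}

	// Issue a short-lived token and verify it round-trips.
	tok, exp, err := svc.GenerateJWT("ns1", "subj", time.Minute)
	if err != nil {
		panic(err)
	}
	claims, err := svc.ParseAndVerifyJWT(tok)
	if err != nil {
		panic(err)
	}
	fmt.Println(claims.Namespace, claims.Sub, exp)
}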
diff --git a/pkg/gateway/routes.go b/pkg/gateway/routes.go index 9314812..6e2a22b 100644 --- a/pkg/gateway/routes.go +++ b/pkg/gateway/routes.go @@ -14,8 +14,8 @@ func (g *Gateway) Routes() http.Handler { mux.HandleFunc("/v1/status", g.statusHandler) // auth endpoints - mux.HandleFunc("/v1/auth/jwks", g.jwksHandler) - mux.HandleFunc("/.well-known/jwks.json", g.jwksHandler) + mux.HandleFunc("/v1/auth/jwks", g.authService.JWKSHandler) + mux.HandleFunc("/.well-known/jwks.json", g.authService.JWKSHandler) mux.HandleFunc("/v1/auth/login", g.loginPageHandler) mux.HandleFunc("/v1/auth/challenge", g.challengeHandler) mux.HandleFunc("/v1/auth/verify", g.verifyHandler) diff --git a/pkg/installer/installer.go b/pkg/installer/installer.go index a545c90..80ab454 100644 --- a/pkg/installer/installer.go +++ b/pkg/installer/installer.go @@ -17,6 +17,7 @@ import ( "github.com/charmbracelet/lipgloss" "github.com/DeBrosOfficial/network/pkg/certutil" + "github.com/DeBrosOfficial/network/pkg/config" "github.com/DeBrosOfficial/network/pkg/tlsutil" ) @@ -338,7 +339,7 @@ func (m *Model) handleEnter() (tea.Model, tea.Cmd) { case StepSwarmKey: swarmKey := strings.TrimSpace(m.textInput.Value()) - if err := validateSwarmKey(swarmKey); err != nil { + if err := config.ValidateSwarmKey(swarmKey); err != nil { m.err = err return m, nil } @@ -816,17 +817,6 @@ func validateClusterSecret(secret string) error { return nil } -func validateSwarmKey(key string) error { - if len(key) != 64 { - return fmt.Errorf("swarm key must be 64 hex characters") - } - keyRegex := regexp.MustCompile(`^[a-fA-F0-9]{64}$`) - if !keyRegex.MatchString(key) { - return fmt.Errorf("swarm key must be valid hexadecimal") - } - return nil -} - // ensureCertificatesForDomain generates self-signed certificates for the domain func ensureCertificatesForDomain(domain string) error { // Get home directory diff --git a/pkg/ipfs/cluster.go b/pkg/ipfs/cluster.go index a203a58..17089a9 100644 --- a/pkg/ipfs/cluster.go +++ b/pkg/ipfs/cluster.go @@ -1,27 +1,16 @@ package ipfs import ( - "bytes" - "crypto/rand" - "encoding/hex" - "encoding/json" "fmt" - "io" - "net" "net/http" - "net/url" "os" "os/exec" "path/filepath" "strings" "time" - "go.uber.org/zap" - "github.com/DeBrosOfficial/network/pkg/config" - "github.com/DeBrosOfficial/network/pkg/tlsutil" - "github.com/libp2p/go-libp2p/core/host" - "github.com/multiformats/go-multiaddr" + "go.uber.org/zap" ) // ClusterConfigManager manages IPFS Cluster configuration files @@ -32,51 +21,8 @@ type ClusterConfigManager struct { secret string } -// ClusterServiceConfig represents the structure of service.json -type ClusterServiceConfig struct { - Cluster struct { - Peername string `json:"peername"` - Secret string `json:"secret"` - LeaveOnShutdown bool `json:"leave_on_shutdown"` - ListenMultiaddress []string `json:"listen_multiaddress"` - PeerAddresses []string `json:"peer_addresses"` - // ... 
other fields kept from template - } `json:"cluster"` - Consensus struct { - CRDT struct { - ClusterName string `json:"cluster_name"` - TrustedPeers []string `json:"trusted_peers"` - Batching struct { - MaxBatchSize int `json:"max_batch_size"` - MaxBatchAge string `json:"max_batch_age"` - } `json:"batching"` - RepairInterval string `json:"repair_interval"` - } `json:"crdt"` - } `json:"consensus"` - API struct { - IPFSProxy struct { - ListenMultiaddress string `json:"listen_multiaddress"` - NodeMultiaddress string `json:"node_multiaddress"` - } `json:"ipfsproxy"` - PinSvcAPI struct { - HTTPListenMultiaddress string `json:"http_listen_multiaddress"` - } `json:"pinsvcapi"` - RestAPI struct { - HTTPListenMultiaddress string `json:"http_listen_multiaddress"` - } `json:"restapi"` - } `json:"api"` - IPFSConnector struct { - IPFSHTTP struct { - NodeMultiaddress string `json:"node_multiaddress"` - } `json:"ipfshttp"` - } `json:"ipfs_connector"` - // Keep rest of fields as raw JSON to preserve structure - Raw map[string]interface{} `json:"-"` -} - // NewClusterConfigManager creates a new IPFS Cluster config manager func NewClusterConfigManager(cfg *config.Config, logger *zap.Logger) (*ClusterConfigManager, error) { - // Expand data directory path dataDir := cfg.Node.DataDir if strings.HasPrefix(dataDir, "~") { home, err := os.UserHomeDir() @@ -86,13 +32,10 @@ func NewClusterConfigManager(cfg *config.Config, logger *zap.Logger) (*ClusterCo dataDir = filepath.Join(home, dataDir[1:]) } - // Determine cluster path based on data directory structure - // Check if dataDir contains specific node names (e.g., ~/.orama/node-1, ~/.orama/node-2, etc.) clusterPath := filepath.Join(dataDir, "ipfs-cluster") nodeNames := []string{"node-1", "node-2", "node-3", "node-4", "node-5"} for _, nodeName := range nodeNames { if strings.Contains(dataDir, nodeName) { - // Check if this is a direct child if filepath.Base(filepath.Dir(dataDir)) == nodeName || filepath.Base(dataDir) == nodeName { clusterPath = filepath.Join(dataDir, "ipfs-cluster") } else { @@ -102,15 +45,11 @@ func NewClusterConfigManager(cfg *config.Config, logger *zap.Logger) (*ClusterCo } } - // Load or generate cluster secret - // Always use ~/.orama/secrets/cluster-secret (new standard location) secretPath := filepath.Join(dataDir, "..", "cluster-secret") if strings.Contains(dataDir, ".orama") { - // Use the secrets directory for proper file organization home, err := os.UserHomeDir() if err == nil { secretsDir := filepath.Join(home, ".orama", "secrets") - // Ensure secrets directory exists if err := os.MkdirAll(secretsDir, 0700); err == nil { secretPath = filepath.Join(secretsDir, "cluster-secret") } @@ -133,25 +72,21 @@ func NewClusterConfigManager(cfg *config.Config, logger *zap.Logger) (*ClusterCo // EnsureConfig ensures the IPFS Cluster service.json exists and is properly configured func (cm *ClusterConfigManager) EnsureConfig() error { if cm.cfg.Database.IPFS.ClusterAPIURL == "" { - cm.logger.Debug("IPFS Cluster API URL not configured, skipping cluster config") return nil } serviceJSONPath := filepath.Join(cm.clusterPath, "service.json") - - // Parse ports from URLs clusterPort, restAPIPort, err := parseClusterPorts(cm.cfg.Database.IPFS.ClusterAPIURL) if err != nil { - return fmt.Errorf("failed to parse cluster API URL: %w", err) + return err } ipfsPort, err := parseIPFSPort(cm.cfg.Database.IPFS.APIURL) if err != nil { - return fmt.Errorf("failed to parse IPFS API URL: %w", err) + return err } - // Determine node name from ID or DataDir - nodeName := 
"node-1" // Default fallback + nodeName := "node-1" possibleNames := []string{"node-1", "node-2", "node-3", "node-4", "node-5"} for _, name := range possibleNames { if strings.Contains(cm.cfg.Node.DataDir, name) || strings.Contains(cm.cfg.Node.ID, name) { @@ -159,1064 +94,54 @@ func (cm *ClusterConfigManager) EnsureConfig() error { break } } - // If ID contains a node identifier, use it - if cm.cfg.Node.ID != "" { - for _, name := range possibleNames { - if strings.Contains(cm.cfg.Node.ID, name) { - nodeName = name - break - } - } - } - // Calculate ports based on pattern - // REST API: 9094 - // Proxy: 9094 - 1 = 9093 (NOT USED - keeping for reference) - // PinSvc: 9094 + 1 = 9095 - // Proxy API: 9094 + 1 = 9095 (actual proxy port) - // PinSvc API: 9094 + 3 = 9097 - // Cluster LibP2P: 9094 + 4 = 9098 - proxyPort := clusterPort + 1 // 9095 (IPFSProxy API) - pinSvcPort := clusterPort + 3 // 9097 (PinSvc API) - clusterListenPort := clusterPort + 4 // 9098 (Cluster LibP2P) + proxyPort := clusterPort + 1 + pinSvcPort := clusterPort + 3 + clusterListenPort := clusterPort + 4 - // If config doesn't exist, initialize it with ipfs-cluster-service init - // This ensures we have all required sections (datastore, informer, etc.) if _, err := os.Stat(serviceJSONPath); os.IsNotExist(err) { - cm.logger.Info("Initializing cluster config with ipfs-cluster-service init") initCmd := exec.Command("ipfs-cluster-service", "init", "--force") initCmd.Env = append(os.Environ(), "IPFS_CLUSTER_PATH="+cm.clusterPath) - if err := initCmd.Run(); err != nil { - cm.logger.Warn("Failed to initialize cluster config with ipfs-cluster-service init, will create minimal template", zap.Error(err)) - } + _ = initCmd.Run() } - // Load existing config or create new cfg, err := cm.loadOrCreateConfig(serviceJSONPath) if err != nil { - return fmt.Errorf("failed to load/create config: %w", err) + return err } - // Update configuration cfg.Cluster.Peername = nodeName cfg.Cluster.Secret = cm.secret cfg.Cluster.ListenMultiaddress = []string{fmt.Sprintf("/ip4/0.0.0.0/tcp/%d", clusterListenPort)} cfg.Consensus.CRDT.ClusterName = "debros-cluster" cfg.Consensus.CRDT.TrustedPeers = []string{"*"} - - // API endpoints cfg.API.RestAPI.HTTPListenMultiaddress = fmt.Sprintf("/ip4/0.0.0.0/tcp/%d", restAPIPort) cfg.API.IPFSProxy.ListenMultiaddress = fmt.Sprintf("/ip4/127.0.0.1/tcp/%d", proxyPort) - cfg.API.IPFSProxy.NodeMultiaddress = fmt.Sprintf("/ip4/127.0.0.1/tcp/%d", ipfsPort) // FIX: Correct path! 
+ cfg.API.IPFSProxy.NodeMultiaddress = fmt.Sprintf("/ip4/127.0.0.1/tcp/%d", ipfsPort) cfg.API.PinSvcAPI.HTTPListenMultiaddress = fmt.Sprintf("/ip4/127.0.0.1/tcp/%d", pinSvcPort) - - // IPFS connector (also needs to be set) cfg.IPFSConnector.IPFSHTTP.NodeMultiaddress = fmt.Sprintf("/ip4/127.0.0.1/tcp/%d", ipfsPort) - // Save configuration - if err := cm.saveConfig(serviceJSONPath, cfg); err != nil { - return fmt.Errorf("failed to save config: %w", err) - } - - cm.logger.Info("IPFS Cluster configuration ensured", - zap.String("path", serviceJSONPath), - zap.String("node_name", nodeName), - zap.Int("ipfs_port", ipfsPort), - zap.Int("cluster_port", clusterPort), - zap.Int("rest_api_port", restAPIPort)) - - return nil + return cm.saveConfig(serviceJSONPath, cfg) } -// UpdatePeerAddresses updates peer_addresses and peerstore with peer information -// Returns true if update was successful, false if peer is not available yet (non-fatal) -func (cm *ClusterConfigManager) UpdatePeerAddresses(peerAPIURL string) (bool, error) { - if cm.cfg.Database.IPFS.ClusterAPIURL == "" { - return false, nil // IPFS not configured - } - - // Skip if this is the first node (creates the cluster, no join address) - if cm.cfg.Database.RQLiteJoinAddress == "" { - return false, nil - } - - // Query peer cluster API to get peer ID - peerID, err := getPeerID(peerAPIURL) - if err != nil { - // Non-fatal: peer might not be available yet - cm.logger.Debug("Peer not available yet, will retry", - zap.String("peer_api", peerAPIURL), - zap.Error(err)) - return false, nil - } - - if peerID == "" { - cm.logger.Debug("Peer ID not available yet") - return false, nil - } - - // Extract peer host and cluster port from URL - peerHost, clusterPort, err := parsePeerHostAndPort(peerAPIURL) - if err != nil { - return false, fmt.Errorf("failed to parse peer cluster API URL: %w", err) - } - - // Peer cluster LibP2P listens on clusterPort + 4 - // (REST API is 9094, LibP2P is 9098 = 9094 + 4) - peerClusterPort := clusterPort + 4 - - // Determine IP protocol (ip4 or ip6) based on the host - var ipProtocol string - if net.ParseIP(peerHost).To4() != nil { - ipProtocol = "ip4" - } else { - ipProtocol = "ip6" - } - - peerAddr := fmt.Sprintf("/%s/%s/tcp/%d/p2p/%s", ipProtocol, peerHost, peerClusterPort, peerID) - - // Load current config - serviceJSONPath := filepath.Join(cm.clusterPath, "service.json") - cfg, err := cm.loadOrCreateConfig(serviceJSONPath) - if err != nil { - return false, fmt.Errorf("failed to load config: %w", err) - } - - // CRITICAL: Always update peerstore file to ensure no stale addresses remain - // Stale addresses (e.g., from old port configurations) cause LibP2P dial backoff, - // preventing cluster peers from connecting even if the correct address is present. - // We must clean and rewrite the peerstore on every update to avoid this. 
- peerstorePath := filepath.Join(cm.clusterPath, "peerstore") - - // Check if peerstore needs updating (avoid unnecessary writes but always clean stale entries) - needsUpdate := true - if peerstoreData, err := os.ReadFile(peerstorePath); err == nil { - // Only skip update if peerstore contains EXACTLY the correct address and nothing else - existingAddrs := strings.Split(strings.TrimSpace(string(peerstoreData)), "\n") - if len(existingAddrs) == 1 && strings.TrimSpace(existingAddrs[0]) == peerAddr { - cm.logger.Debug("Peer address already correct in peerstore", zap.String("addr", peerAddr)) - needsUpdate = false - } - } - - if needsUpdate { - // Write ONLY the correct peer address, removing any stale entries - if err := os.WriteFile(peerstorePath, []byte(peerAddr+"\n"), 0644); err != nil { - return false, fmt.Errorf("failed to write peerstore: %w", err) - } - cm.logger.Info("Updated peerstore with peer (cleaned stale entries)", - zap.String("addr", peerAddr), - zap.String("peerstore_path", peerstorePath)) - } - - // Then sync service.json from peerstore to keep them in sync - cfg.Cluster.PeerAddresses = []string{peerAddr} - - // Save config - if err := cm.saveConfig(serviceJSONPath, cfg); err != nil { - return false, fmt.Errorf("failed to save config: %w", err) - } - - cm.logger.Info("Updated peer configuration", - zap.String("peer_addr", peerAddr), - zap.String("peerstore_path", peerstorePath)) - - return true, nil -} - -// UpdateAllClusterPeers discovers all cluster peers from the local cluster API -// and updates peer_addresses in service.json. This allows IPFS Cluster to automatically -// connect to all discovered peers in the cluster. -// Returns true if update was successful, false if cluster is not available yet (non-fatal) -func (cm *ClusterConfigManager) UpdateAllClusterPeers() (bool, error) { - if cm.cfg.Database.IPFS.ClusterAPIURL == "" { - return false, nil // IPFS not configured - } - - // Query local cluster API to get all peers - client := newStandardHTTPClient() - peersURL := fmt.Sprintf("%s/peers", cm.cfg.Database.IPFS.ClusterAPIURL) - resp, err := client.Get(peersURL) - if err != nil { - // Non-fatal: cluster might not be available yet - cm.logger.Debug("Cluster API not available yet, will retry", - zap.String("peers_url", peersURL), - zap.Error(err)) - return false, nil - } - - // Parse NDJSON response - dec := json.NewDecoder(bytes.NewReader(resp)) - var allPeerAddresses []string - seenPeers := make(map[string]bool) - peerIDToAddresses := make(map[string][]string) - - // First pass: collect all peer IDs and their addresses - for { - var peerInfo struct { - ID string `json:"id"` - Addresses []string `json:"addresses"` - ClusterPeers []string `json:"cluster_peers"` - ClusterPeersAddresses []string `json:"cluster_peers_addresses"` - } - - err := dec.Decode(&peerInfo) - if err != nil { - if err == io.EOF { - break - } - cm.logger.Debug("Failed to decode peer info", zap.Error(err)) - continue - } - - // Store this peer's addresses - if peerInfo.ID != "" { - peerIDToAddresses[peerInfo.ID] = peerInfo.Addresses - } - - // Also collect cluster peers addresses if available - // These are addresses of all peers in the cluster - for _, addr := range peerInfo.ClusterPeersAddresses { - if ma, err := multiaddr.NewMultiaddr(addr); err == nil { - // Validate it has p2p component (peer ID) - if _, err := ma.ValueForProtocol(multiaddr.P_P2P); err == nil { - addrStr := ma.String() - if !seenPeers[addrStr] { - allPeerAddresses = append(allPeerAddresses, addrStr) - seenPeers[addrStr] = true - 
} - } - } - } - } - - // If we didn't get cluster_peers_addresses, try to construct them from peer IDs and addresses - if len(allPeerAddresses) == 0 && len(peerIDToAddresses) > 0 { - // Get cluster listen port from config - serviceJSONPath := filepath.Join(cm.clusterPath, "service.json") - cfg, err := cm.loadOrCreateConfig(serviceJSONPath) - if err == nil && len(cfg.Cluster.ListenMultiaddress) > 0 { - // Extract port from listen_multiaddress (e.g., "/ip4/0.0.0.0/tcp/9098") - listenAddr := cfg.Cluster.ListenMultiaddress[0] - if ma, err := multiaddr.NewMultiaddr(listenAddr); err == nil { - if port, err := ma.ValueForProtocol(multiaddr.P_TCP); err == nil { - // For each peer ID, try to find its IP address and construct cluster multiaddr - for peerID, addresses := range peerIDToAddresses { - // Try to find an IP address in the peer's addresses - for _, addrStr := range addresses { - if ma, err := multiaddr.NewMultiaddr(addrStr); err == nil { - // Extract IP address (IPv4 or IPv6) - if ip, err := ma.ValueForProtocol(multiaddr.P_IP4); err == nil && ip != "" { - clusterAddr := fmt.Sprintf("/ip4/%s/tcp/%s/p2p/%s", ip, port, peerID) - if !seenPeers[clusterAddr] { - allPeerAddresses = append(allPeerAddresses, clusterAddr) - seenPeers[clusterAddr] = true - } - break - } else if ip, err := ma.ValueForProtocol(multiaddr.P_IP6); err == nil && ip != "" { - clusterAddr := fmt.Sprintf("/ip6/%s/tcp/%s/p2p/%s", ip, port, peerID) - if !seenPeers[clusterAddr] { - allPeerAddresses = append(allPeerAddresses, clusterAddr) - seenPeers[clusterAddr] = true - } - break - } - } - } - } - } - } - } - } - - if len(allPeerAddresses) == 0 { - cm.logger.Debug("No cluster peer addresses found in API response") - return false, nil - } - - // Load current config - serviceJSONPath := filepath.Join(cm.clusterPath, "service.json") - cfg, err := cm.loadOrCreateConfig(serviceJSONPath) - if err != nil { - return false, fmt.Errorf("failed to load config: %w", err) - } - - // Check if peer addresses have changed - addressesChanged := false - if len(cfg.Cluster.PeerAddresses) != len(allPeerAddresses) { - addressesChanged = true - } else { - // Check if addresses are different - currentAddrs := make(map[string]bool) - for _, addr := range cfg.Cluster.PeerAddresses { - currentAddrs[addr] = true - } - for _, addr := range allPeerAddresses { - if !currentAddrs[addr] { - addressesChanged = true - break - } - } - } - - if !addressesChanged { - cm.logger.Debug("Cluster peer addresses already up to date", - zap.Int("peer_count", len(allPeerAddresses))) - return true, nil - } - - // Update peerstore file FIRST - this is what IPFS Cluster reads for bootstrapping - // Peerstore is the source of truth, service.json is just for our tracking - peerstorePath := filepath.Join(cm.clusterPath, "peerstore") - peerstoreContent := strings.Join(allPeerAddresses, "\n") + "\n" - if err := os.WriteFile(peerstorePath, []byte(peerstoreContent), 0644); err != nil { - cm.logger.Warn("Failed to update peerstore file", zap.Error(err)) - // Non-fatal, continue - } - - // Then sync service.json from peerstore to keep them in sync - cfg.Cluster.PeerAddresses = allPeerAddresses - - // Save config - if err := cm.saveConfig(serviceJSONPath, cfg); err != nil { - return false, fmt.Errorf("failed to save config: %w", err) - } - - cm.logger.Info("Updated cluster peer addresses", - zap.Int("peer_count", len(allPeerAddresses)), - zap.Strings("peer_addresses", allPeerAddresses)) - - return true, nil -} - -// RepairPeerConfiguration automatically discovers and repairs peer 
configuration -// Tries multiple methods: gateway /v1/network/status, config-based discovery, peer multiaddr -func (cm *ClusterConfigManager) RepairPeerConfiguration() (bool, error) { - if cm.cfg.Database.IPFS.ClusterAPIURL == "" { - return false, nil // IPFS not configured - } - - // Method 1: Try to discover cluster peers via /v1/network/status endpoint - // This is the most reliable method as it uses the HTTPS gateway - if len(cm.cfg.Discovery.BootstrapPeers) > 0 { - success, err := cm.DiscoverClusterPeersFromGateway() - if err != nil { - cm.logger.Debug("Gateway discovery failed, trying direct API", zap.Error(err)) - } else if success { - cm.logger.Info("Successfully discovered cluster peers from gateway") - return true, nil - } - } - - // Skip direct API method if this is the first node (creates the cluster, no join address) - if cm.cfg.Database.RQLiteJoinAddress == "" { - return false, nil - } - - // Method 2: Try direct cluster API (fallback) - var peerAPIURL string - - // Try to extract from peers multiaddr - if len(cm.cfg.Discovery.BootstrapPeers) > 0 { - if ip := extractIPFromMultiaddrForCluster(cm.cfg.Discovery.BootstrapPeers[0]); ip != "" { - // Default cluster API port is 9094 - peerAPIURL = fmt.Sprintf("http://%s:9094", ip) - cm.logger.Debug("Inferred peer cluster API from peer", - zap.String("peer_api", peerAPIURL)) - } - } - - // Fallback to localhost if nothing found (for local development) - if peerAPIURL == "" { - peerAPIURL = "http://localhost:9094" - cm.logger.Debug("Using localhost fallback for peer cluster API") - } - - // Try to update peers - success, err := cm.UpdatePeerAddresses(peerAPIURL) - if err != nil { - return false, err - } - - if success { - cm.logger.Info("Successfully repaired peer configuration via direct API") - return true, nil - } - - // If update failed (peer not available), return false but no error - // This allows retries later - return false, nil -} - -// DiscoverClusterPeersFromGateway queries bootstrap peers' /v1/network/status endpoint -// to discover IPFS Cluster peer information and updates the local service.json -func (cm *ClusterConfigManager) DiscoverClusterPeersFromGateway() (bool, error) { - if len(cm.cfg.Discovery.BootstrapPeers) == 0 { - cm.logger.Debug("No bootstrap peers configured, skipping gateway discovery") - return false, nil - } - - var discoveredPeers []string - seenPeers := make(map[string]bool) - - for _, peerAddr := range cm.cfg.Discovery.BootstrapPeers { - // Extract domain or IP from multiaddr - domain := extractDomainFromMultiaddr(peerAddr) - if domain == "" { - continue - } - - // Query /v1/network/status endpoint - statusURL := fmt.Sprintf("https://%s/v1/network/status", domain) - cm.logger.Debug("Querying peer network status", zap.String("url", statusURL)) - - // Use TLS-aware HTTP client (handles staging certs for *.debros.network) - client := tlsutil.NewHTTPClientForDomain(10*time.Second, domain) - resp, err := client.Get(statusURL) - if err != nil { - // Try HTTP fallback - statusURL = fmt.Sprintf("http://%s/v1/network/status", domain) - resp, err = client.Get(statusURL) - if err != nil { - cm.logger.Debug("Failed to query peer status", zap.String("domain", domain), zap.Error(err)) - continue - } - } - defer resp.Body.Close() - - if resp.StatusCode != http.StatusOK { - cm.logger.Debug("Peer returned non-OK status", zap.String("domain", domain), zap.Int("status", resp.StatusCode)) - continue - } - - // Parse response - var status struct { - IPFSCluster *struct { - PeerID string `json:"peer_id"` - Addresses 
[]string `json:"addresses"` - } `json:"ipfs_cluster"` - } - if err := json.NewDecoder(resp.Body).Decode(&status); err != nil { - cm.logger.Debug("Failed to decode peer status", zap.String("domain", domain), zap.Error(err)) - continue - } - - if status.IPFSCluster == nil || status.IPFSCluster.PeerID == "" { - cm.logger.Debug("Peer has no IPFS Cluster info", zap.String("domain", domain)) - continue - } - - // Extract IP from domain or addresses - peerIP := extractIPFromMultiaddrForCluster(peerAddr) - if peerIP == "" { - // Try to resolve domain - ips, err := net.LookupIP(domain) - if err == nil && len(ips) > 0 { - for _, ip := range ips { - if ip.To4() != nil { - peerIP = ip.String() - break - } - } - } - } - - if peerIP == "" { - cm.logger.Debug("Could not determine peer IP", zap.String("domain", domain)) - continue - } - - // Construct cluster multiaddr - // IPFS Cluster listens on port 9098 (REST API port 9094 + 4) - clusterAddr := fmt.Sprintf("/ip4/%s/tcp/9098/p2p/%s", peerIP, status.IPFSCluster.PeerID) - if !seenPeers[clusterAddr] { - discoveredPeers = append(discoveredPeers, clusterAddr) - seenPeers[clusterAddr] = true - cm.logger.Info("Discovered cluster peer from gateway", - zap.String("domain", domain), - zap.String("peer_id", status.IPFSCluster.PeerID), - zap.String("cluster_addr", clusterAddr)) - } - } - - if len(discoveredPeers) == 0 { - cm.logger.Debug("No cluster peers discovered from gateway") - return false, nil - } - - // Load current config - serviceJSONPath := filepath.Join(cm.clusterPath, "service.json") - cfg, err := cm.loadOrCreateConfig(serviceJSONPath) - if err != nil { - return false, fmt.Errorf("failed to load config: %w", err) - } - - // Update peerstore file - peerstorePath := filepath.Join(cm.clusterPath, "peerstore") - peerstoreContent := strings.Join(discoveredPeers, "\n") + "\n" - if err := os.WriteFile(peerstorePath, []byte(peerstoreContent), 0644); err != nil { - cm.logger.Warn("Failed to update peerstore file", zap.Error(err)) - } - - // Update peer_addresses in config - cfg.Cluster.PeerAddresses = discoveredPeers - - // Save config - if err := cm.saveConfig(serviceJSONPath, cfg); err != nil { - return false, fmt.Errorf("failed to save config: %w", err) - } - - cm.logger.Info("Updated cluster peer addresses from gateway discovery", - zap.Int("peer_count", len(discoveredPeers)), - zap.Strings("peer_addresses", discoveredPeers)) - - return true, nil -} - -// extractDomainFromMultiaddr extracts domain or IP from a multiaddr string -// Handles formats like /dns4/domain/tcp/port/p2p/id or /ip4/ip/tcp/port/p2p/id -func extractDomainFromMultiaddr(multiaddrStr string) string { - ma, err := multiaddr.NewMultiaddr(multiaddrStr) - if err != nil { - return "" - } - - // Try DNS4 first (domain name) - if domain, err := ma.ValueForProtocol(multiaddr.P_DNS4); err == nil && domain != "" { - return domain - } - - // Try DNS6 - if domain, err := ma.ValueForProtocol(multiaddr.P_DNS6); err == nil && domain != "" { - return domain - } - - // Try IP4 - if ip, err := ma.ValueForProtocol(multiaddr.P_IP4); err == nil && ip != "" { - return ip - } - - // Try IP6 - if ip, err := ma.ValueForProtocol(multiaddr.P_IP6); err == nil && ip != "" { - return ip - } - - return "" -} - -// DiscoverClusterPeersFromLibP2P loads IPFS cluster peer addresses from the peerstore file. -// If peerstore is empty, it means there are no peers to connect to. 
-// Returns true if peers were loaded and configured, false otherwise (non-fatal) -func (cm *ClusterConfigManager) DiscoverClusterPeersFromLibP2P(host host.Host) (bool, error) { - if cm.cfg.Database.IPFS.ClusterAPIURL == "" { - return false, nil // IPFS not configured - } - - // Load peer addresses from peerstore file - peerstorePath := filepath.Join(cm.clusterPath, "peerstore") - peerstoreData, err := os.ReadFile(peerstorePath) - if err != nil { - // Peerstore file doesn't exist or can't be read - no peers to connect to - cm.logger.Debug("Peerstore file not found or empty - no cluster peers to connect to", - zap.String("peerstore_path", peerstorePath)) - return false, nil - } - - var allPeerAddresses []string - seenPeers := make(map[string]bool) - - // Parse peerstore file (one multiaddr per line) - lines := strings.Split(strings.TrimSpace(string(peerstoreData)), "\n") - for _, line := range lines { - line = strings.TrimSpace(line) - if line != "" && strings.HasPrefix(line, "/") { - // Validate it's a proper multiaddr with p2p component - if ma, err := multiaddr.NewMultiaddr(line); err == nil { - if _, err := ma.ValueForProtocol(multiaddr.P_P2P); err == nil { - if !seenPeers[line] { - allPeerAddresses = append(allPeerAddresses, line) - seenPeers[line] = true - cm.logger.Debug("Loaded cluster peer address from peerstore", - zap.String("addr", line)) - } - } - } - } - } - - if len(allPeerAddresses) == 0 { - cm.logger.Debug("Peerstore file is empty - no cluster peers to connect to") - return false, nil - } - - // Get config to update peer_addresses - serviceJSONPath := filepath.Join(cm.clusterPath, "service.json") - cfg, err := cm.loadOrCreateConfig(serviceJSONPath) - if err != nil { - return false, fmt.Errorf("failed to load config: %w", err) - } - - // Check if peer addresses have changed - addressesChanged := false - if len(cfg.Cluster.PeerAddresses) != len(allPeerAddresses) { - addressesChanged = true - } else { - currentAddrs := make(map[string]bool) - for _, addr := range cfg.Cluster.PeerAddresses { - currentAddrs[addr] = true - } - for _, addr := range allPeerAddresses { - if !currentAddrs[addr] { - addressesChanged = true - break - } - } - } - - if !addressesChanged { - cm.logger.Debug("Cluster peer addresses already up to date", - zap.Int("peer_count", len(allPeerAddresses))) - return true, nil - } - - // Update peer_addresses - cfg.Cluster.PeerAddresses = allPeerAddresses - - // Save config - if err := cm.saveConfig(serviceJSONPath, cfg); err != nil { - return false, fmt.Errorf("failed to save config: %w", err) - } - - cm.logger.Info("Loaded cluster peer addresses from peerstore", - zap.Int("peer_count", len(allPeerAddresses)), - zap.Strings("peer_addresses", allPeerAddresses)) - - return true, nil -} - -// loadOrCreateConfig loads existing service.json or creates a template -func (cm *ClusterConfigManager) loadOrCreateConfig(path string) (*ClusterServiceConfig, error) { - // Try to load existing config - if data, err := os.ReadFile(path); err == nil { - var cfg ClusterServiceConfig - if err := json.Unmarshal(data, &cfg); err == nil { - // Also unmarshal into raw map to preserve all fields - var raw map[string]interface{} - if err := json.Unmarshal(data, &raw); err == nil { - cfg.Raw = raw - } - return &cfg, nil - } - } - - // Create new config from template - return cm.createTemplateConfig(), nil -} - -// createTemplateConfig creates a template configuration matching the structure -func (cm *ClusterConfigManager) createTemplateConfig() *ClusterServiceConfig { - cfg := 
&ClusterServiceConfig{} - cfg.Cluster.LeaveOnShutdown = false - cfg.Cluster.PeerAddresses = []string{} - cfg.Consensus.CRDT.TrustedPeers = []string{"*"} - cfg.Consensus.CRDT.Batching.MaxBatchSize = 0 - cfg.Consensus.CRDT.Batching.MaxBatchAge = "0s" - cfg.Consensus.CRDT.RepairInterval = "1h0m0s" - cfg.Raw = make(map[string]interface{}) - return cfg -} - -// saveConfig saves the configuration, preserving all existing fields -func (cm *ClusterConfigManager) saveConfig(path string, cfg *ClusterServiceConfig) error { - // Create directory if needed - if err := os.MkdirAll(filepath.Dir(path), 0755); err != nil { - return fmt.Errorf("failed to create cluster directory: %w", err) - } - - // Load existing config if it exists to preserve all fields - var final map[string]interface{} - if data, err := os.ReadFile(path); err == nil { - if err := json.Unmarshal(data, &final); err != nil { - // If parsing fails, start fresh - final = make(map[string]interface{}) - } - } else { - final = make(map[string]interface{}) - } - - // Deep merge: update nested structures while preserving other fields - updateNestedMap(final, "cluster", map[string]interface{}{ - "peername": cfg.Cluster.Peername, - "secret": cfg.Cluster.Secret, - "leave_on_shutdown": cfg.Cluster.LeaveOnShutdown, - "listen_multiaddress": cfg.Cluster.ListenMultiaddress, - "peer_addresses": cfg.Cluster.PeerAddresses, - }) - - updateNestedMap(final, "consensus", map[string]interface{}{ - "crdt": map[string]interface{}{ - "cluster_name": cfg.Consensus.CRDT.ClusterName, - "trusted_peers": cfg.Consensus.CRDT.TrustedPeers, - "batching": map[string]interface{}{ - "max_batch_size": cfg.Consensus.CRDT.Batching.MaxBatchSize, - "max_batch_age": cfg.Consensus.CRDT.Batching.MaxBatchAge, - }, - "repair_interval": cfg.Consensus.CRDT.RepairInterval, - }, - }) - - // Update API section, preserving other fields - updateNestedMap(final, "api", map[string]interface{}{ - "ipfsproxy": map[string]interface{}{ - "listen_multiaddress": cfg.API.IPFSProxy.ListenMultiaddress, - "node_multiaddress": cfg.API.IPFSProxy.NodeMultiaddress, // FIX: Correct path! 
- }, - "pinsvcapi": map[string]interface{}{ - "http_listen_multiaddress": cfg.API.PinSvcAPI.HTTPListenMultiaddress, - }, - "restapi": map[string]interface{}{ - "http_listen_multiaddress": cfg.API.RestAPI.HTTPListenMultiaddress, - }, - }) - - // Update IPFS connector section - updateNestedMap(final, "ipfs_connector", map[string]interface{}{ - "ipfshttp": map[string]interface{}{ - "node_multiaddress": cfg.IPFSConnector.IPFSHTTP.NodeMultiaddress, - "connect_swarms_delay": "30s", - "ipfs_request_timeout": "5m0s", - "pin_timeout": "2m0s", - "unpin_timeout": "3h0m0s", - "repogc_timeout": "24h0m0s", - "informer_trigger_interval": 0, - }, - }) - - // Ensure all required sections exist with defaults if missing - ensureRequiredSection(final, "datastore", map[string]interface{}{ - "pebble": map[string]interface{}{ - "pebble_options": map[string]interface{}{ - "cache_size_bytes": 1073741824, - "bytes_per_sync": 1048576, - "disable_wal": false, - }, - }, - }) - - ensureRequiredSection(final, "informer", map[string]interface{}{ - "disk": map[string]interface{}{ - "metric_ttl": "30s", - "metric_type": "freespace", - }, - "pinqueue": map[string]interface{}{ - "metric_ttl": "30s", - "weight_bucket_size": 100000, - }, - "tags": map[string]interface{}{ - "metric_ttl": "30s", - "tags": map[string]interface{}{ - "group": "default", - }, - }, - }) - - ensureRequiredSection(final, "monitor", map[string]interface{}{ - "pubsubmon": map[string]interface{}{ - "check_interval": "15s", - }, - }) - - ensureRequiredSection(final, "pin_tracker", map[string]interface{}{ - "stateless": map[string]interface{}{ - "concurrent_pins": 10, - "priority_pin_max_age": "24h0m0s", - "priority_pin_max_retries": 5, - }, - }) - - ensureRequiredSection(final, "allocator", map[string]interface{}{ - "balanced": map[string]interface{}{ - "allocate_by": []interface{}{"tag:group", "freespace"}, - }, - }) - - // Write JSON - data, err := json.MarshalIndent(final, "", " ") - if err != nil { - return fmt.Errorf("failed to marshal config: %w", err) - } - - if err := os.WriteFile(path, data, 0644); err != nil { - return fmt.Errorf("failed to write config: %w", err) - } - - return nil -} - -// updateNestedMap updates a nested map structure, merging values -func updateNestedMap(parent map[string]interface{}, key string, updates map[string]interface{}) { - existing, ok := parent[key].(map[string]interface{}) - if !ok { - parent[key] = updates - return - } - - // Merge updates into existing - for k, v := range updates { - if vm, ok := v.(map[string]interface{}); ok { - // Recursively merge nested maps - if _, ok := existing[k].(map[string]interface{}); !ok { - existing[k] = vm - } else { - updateNestedMap(existing, k, vm) - } - } else { - existing[k] = v - } - } - parent[key] = existing -} - -// ensureRequiredSection ensures a section exists in the config, creating it with defaults if missing -func ensureRequiredSection(parent map[string]interface{}, key string, defaults map[string]interface{}) { - if _, exists := parent[key]; !exists { - parent[key] = defaults - return - } - // If section exists, merge defaults to ensure all required subsections exist - existing, ok := parent[key].(map[string]interface{}) - if ok { - updateNestedMap(parent, key, defaults) - parent[key] = existing - } -} - -// parsePeerHostAndPort extracts host and REST API port from peer API URL -func parsePeerHostAndPort(peerAPIURL string) (host string, restAPIPort int, err error) { - u, err := url.Parse(peerAPIURL) - if err != nil { - return "", 0, err - } - - host = u.Hostname() - 
if host == "" { - return "", 0, fmt.Errorf("no host in URL: %s", peerAPIURL) - } - - portStr := u.Port() - if portStr == "" { - // Default port based on scheme - if u.Scheme == "http" { - portStr = "9094" - } else if u.Scheme == "https" { - portStr = "443" - } else { - return "", 0, fmt.Errorf("unknown scheme: %s", u.Scheme) - } - } - - _, err = fmt.Sscanf(portStr, "%d", &restAPIPort) - if err != nil { - return "", 0, fmt.Errorf("invalid port: %s", portStr) - } - - return host, restAPIPort, nil -} - -// parseClusterPorts extracts cluster port and REST API port from ClusterAPIURL -func parseClusterPorts(clusterAPIURL string) (clusterPort, restAPIPort int, err error) { - u, err := url.Parse(clusterAPIURL) - if err != nil { - return 0, 0, err - } - - portStr := u.Port() - if portStr == "" { - // Default port based on scheme - if u.Scheme == "http" { - portStr = "9094" - } else if u.Scheme == "https" { - portStr = "443" - } else { - return 0, 0, fmt.Errorf("unknown scheme: %s", u.Scheme) - } - } - - _, err = fmt.Sscanf(portStr, "%d", &restAPIPort) - if err != nil { - return 0, 0, fmt.Errorf("invalid port: %s", portStr) - } - - // clusterPort is used as the base port for calculations - // The actual cluster LibP2P listen port is calculated as clusterPort + 4 - clusterPort = restAPIPort - - return clusterPort, restAPIPort, nil -} - -// parseIPFSPort extracts IPFS API port from APIURL -func parseIPFSPort(apiURL string) (int, error) { - if apiURL == "" { - return 5001, nil // Default - } - - u, err := url.Parse(apiURL) - if err != nil { - return 0, err - } - - portStr := u.Port() - if portStr == "" { - if u.Scheme == "http" { - return 5001, nil // Default HTTP port - } - return 0, fmt.Errorf("unknown scheme: %s", u.Scheme) - } - - var port int - _, err = fmt.Sscanf(portStr, "%d", &port) - if err != nil { - return 0, fmt.Errorf("invalid port: %s", portStr) - } - - return port, nil -} - -// getPeerID queries the cluster API to get the peer ID -func getPeerID(apiURL string) (string, error) { - // Simple HTTP client to query /peers endpoint - client := newStandardHTTPClient() - resp, err := client.Get(fmt.Sprintf("%s/peers", apiURL)) - if err != nil { - return "", err - } - - // The /peers endpoint returns NDJSON (newline-delimited JSON) - // We need to read the first peer object to get the peer ID - dec := json.NewDecoder(bytes.NewReader(resp)) - var firstPeer struct { - ID string `json:"id"` - } - if err := dec.Decode(&firstPeer); err != nil { - return "", fmt.Errorf("failed to decode first peer: %w", err) - } - - return firstPeer.ID, nil -} - -// loadOrGenerateClusterSecret loads cluster secret or generates a new one -func loadOrGenerateClusterSecret(path string) (string, error) { - // Try to load existing secret - if data, err := os.ReadFile(path); err == nil { - return strings.TrimSpace(string(data)), nil - } - - // Generate new secret (32 bytes hex = 64 hex chars) - secret := generateRandomSecret(64) - - // Save secret - if err := os.MkdirAll(filepath.Dir(path), 0755); err != nil { - return "", err - } - if err := os.WriteFile(path, []byte(secret), 0600); err != nil { - return "", err - } - - return secret, nil -} - -// generateRandomSecret generates a random hex string -func generateRandomSecret(length int) string { - bytes := make([]byte, length/2) - if _, err := rand.Read(bytes); err != nil { - // Fallback to simple generation if crypto/rand fails - for i := range bytes { - bytes[i] = byte(os.Getpid() + i) - } - } - return hex.EncodeToString(bytes) -} - -// standardHTTPClient implements HTTP 
client using net/http with centralized TLS configuration -type standardHTTPClient struct { - client *http.Client -} - -func newStandardHTTPClient() *standardHTTPClient { - return &standardHTTPClient{ - client: tlsutil.NewHTTPClient(30 * time.Second), - } -} - -func (c *standardHTTPClient) Get(url string) ([]byte, error) { - resp, err := c.client.Get(url) - if err != nil { - return nil, err - } - defer resp.Body.Close() - - if resp.StatusCode != http.StatusOK { - return nil, fmt.Errorf("HTTP %d: %s", resp.StatusCode, resp.Status) - } - - data, err := io.ReadAll(resp.Body) - if err != nil { - return nil, err - } - - return data, nil -} - -// extractIPFromMultiaddrForCluster extracts IP address from a LibP2P multiaddr string -// Used for inferring bootstrap cluster API URL -func extractIPFromMultiaddrForCluster(multiaddrStr string) string { - // Parse multiaddr - ma, err := multiaddr.NewMultiaddr(multiaddrStr) - if err != nil { - return "" - } - - // Try to extract IPv4 address - if ipv4, err := ma.ValueForProtocol(multiaddr.P_IP4); err == nil && ipv4 != "" { - return ipv4 - } - - // Try to extract IPv6 address - if ipv6, err := ma.ValueForProtocol(multiaddr.P_IP6); err == nil && ipv6 != "" { - return ipv6 - } - - return "" -} - -// FixIPFSConfigAddresses fixes localhost addresses in IPFS config to use 127.0.0.1 -// This is necessary because IPFS doesn't accept "localhost" as a valid IP address in multiaddrs -// This function always ensures the config is correct, regardless of current state +// FixIPFSConfigAddresses fixes localhost addresses in IPFS config func (cm *ClusterConfigManager) FixIPFSConfigAddresses() error { if cm.cfg.Database.IPFS.APIURL == "" { - return nil // IPFS not configured + return nil } - // Determine IPFS repo path from config dataDir := cm.cfg.Node.DataDir if strings.HasPrefix(dataDir, "~") { - home, err := os.UserHomeDir() - if err != nil { - return fmt.Errorf("failed to determine home directory: %w", err) - } + home, _ := os.UserHomeDir() dataDir = filepath.Join(home, dataDir[1:]) } - // Try to find IPFS repo path - // Check common locations: dataDir/ipfs/repo, dataDir/node-1/ipfs/repo, etc. possiblePaths := []string{ filepath.Join(dataDir, "ipfs", "repo"), filepath.Join(dataDir, "node-1", "ipfs", "repo"), filepath.Join(dataDir, "node-2", "ipfs", "repo"), - filepath.Join(dataDir, "node-3", "ipfs", "repo"), filepath.Join(filepath.Dir(dataDir), "node-1", "ipfs", "repo"), filepath.Join(filepath.Dir(dataDir), "node-2", "ipfs", "repo"), - filepath.Join(filepath.Dir(dataDir), "node-3", "ipfs", "repo"), } var ipfsRepoPath string @@ -1228,76 +153,48 @@ func (cm *ClusterConfigManager) FixIPFSConfigAddresses() error { } if ipfsRepoPath == "" { - cm.logger.Debug("IPFS repo not found, skipping config fix") - return nil // Not an error if repo doesn't exist yet + return nil } - // Parse IPFS API port from config - ipfsPort, err := parseIPFSPort(cm.cfg.Database.IPFS.APIURL) - if err != nil { - return fmt.Errorf("failed to parse IPFS API URL: %w", err) - } - - // Determine gateway port (typically API port + 3079, or 8080 for node-1, 8081 for node-2, etc.) 
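+	// Gateway port heuristic applied below: the default repo (IPFS API 5001)
+	// keeps the gateway on 8080, while node2 / API 5002 maps to 8081 and
+	// node3 / API 5003 maps to 8082, mirroring the multi-node dev layout.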
+ ipfsPort, _ := parseIPFSPort(cm.cfg.Database.IPFS.APIURL) gatewayPort := 8080 - if strings.Contains(dataDir, "node2") { + if strings.Contains(dataDir, "node2") || ipfsPort == 5002 { gatewayPort = 8081 - } else if strings.Contains(dataDir, "node3") { - gatewayPort = 8082 - } else if ipfsPort == 5002 { - gatewayPort = 8081 - } else if ipfsPort == 5003 { + } else if strings.Contains(dataDir, "node3") || ipfsPort == 5003 { gatewayPort = 8082 } - // Always ensure API address is correct (don't just check, always set it) correctAPIAddr := fmt.Sprintf(`["/ip4/0.0.0.0/tcp/%d"]`, ipfsPort) - cm.logger.Info("Ensuring IPFS API address is correct", - zap.String("repo", ipfsRepoPath), - zap.Int("port", ipfsPort), - zap.String("correct_address", correctAPIAddr)) - fixCmd := exec.Command("ipfs", "config", "--json", "Addresses.API", correctAPIAddr) fixCmd.Env = append(os.Environ(), "IPFS_PATH="+ipfsRepoPath) - if err := fixCmd.Run(); err != nil { - cm.logger.Warn("Failed to fix IPFS API address", zap.Error(err)) - return fmt.Errorf("failed to set IPFS API address: %w", err) - } + _ = fixCmd.Run() - // Always ensure Gateway address is correct correctGatewayAddr := fmt.Sprintf(`["/ip4/0.0.0.0/tcp/%d"]`, gatewayPort) - cm.logger.Info("Ensuring IPFS Gateway address is correct", - zap.String("repo", ipfsRepoPath), - zap.Int("port", gatewayPort), - zap.String("correct_address", correctGatewayAddr)) - fixCmd = exec.Command("ipfs", "config", "--json", "Addresses.Gateway", correctGatewayAddr) fixCmd.Env = append(os.Environ(), "IPFS_PATH="+ipfsRepoPath) - if err := fixCmd.Run(); err != nil { - cm.logger.Warn("Failed to fix IPFS Gateway address", zap.Error(err)) - return fmt.Errorf("failed to set IPFS Gateway address: %w", err) - } - - // Check if IPFS daemon is running - if so, it may need to be restarted for changes to take effect - // We can't restart it from here (it's managed by Makefile/systemd), but we can warn - if cm.isIPFSRunning(ipfsPort) { - cm.logger.Warn("IPFS daemon appears to be running - it may need to be restarted for config changes to take effect", - zap.Int("port", ipfsPort), - zap.String("repo", ipfsRepoPath)) - } + _ = fixCmd.Run() return nil } -// isIPFSRunning checks if IPFS daemon is running by attempting to connect to the API func (cm *ClusterConfigManager) isIPFSRunning(port int) bool { - client := &http.Client{ - Timeout: 1 * time.Second, - } + client := &http.Client{Timeout: 1 * time.Second} resp, err := client.Get(fmt.Sprintf("http://127.0.0.1:%d/api/v0/id", port)) if err != nil { return false } resp.Body.Close() - return resp.StatusCode == 200 + return true +} + +func (cm *ClusterConfigManager) createTemplateConfig() *ClusterServiceConfig { + cfg := &ClusterServiceConfig{} + cfg.Cluster.LeaveOnShutdown = false + cfg.Cluster.PeerAddresses = []string{} + cfg.Consensus.CRDT.TrustedPeers = []string{"*"} + cfg.Consensus.CRDT.Batching.MaxBatchSize = 0 + cfg.Consensus.CRDT.Batching.MaxBatchAge = "0s" + cfg.Consensus.CRDT.RepairInterval = "1h0m0s" + cfg.Raw = make(map[string]interface{}) + return cfg } diff --git a/pkg/ipfs/cluster_config.go b/pkg/ipfs/cluster_config.go new file mode 100644 index 0000000..2262547 --- /dev/null +++ b/pkg/ipfs/cluster_config.go @@ -0,0 +1,136 @@ +package ipfs + +import ( + "encoding/json" + "fmt" + "os" + "path/filepath" +) + +// ClusterServiceConfig represents the service.json configuration +type ClusterServiceConfig struct { + Cluster struct { + Peername string `json:"peername"` + Secret string `json:"secret"` + ListenMultiaddress []string 
`json:"listen_multiaddress"` + PeerAddresses []string `json:"peer_addresses"` + LeaveOnShutdown bool `json:"leave_on_shutdown"` + } `json:"cluster"` + + Consensus struct { + CRDT struct { + ClusterName string `json:"cluster_name"` + TrustedPeers []string `json:"trusted_peers"` + Batching struct { + MaxBatchSize int `json:"max_batch_size"` + MaxBatchAge string `json:"max_batch_age"` + } `json:"batching"` + RepairInterval string `json:"repair_interval"` + } `json:"crdt"` + } `json:"consensus"` + + API struct { + RestAPI struct { + HTTPListenMultiaddress string `json:"http_listen_multiaddress"` + } `json:"restapi"` + IPFSProxy struct { + ListenMultiaddress string `json:"listen_multiaddress"` + NodeMultiaddress string `json:"node_multiaddress"` + } `json:"ipfsproxy"` + PinSvcAPI struct { + HTTPListenMultiaddress string `json:"http_listen_multiaddress"` + } `json:"pinsvcapi"` + } `json:"api"` + + IPFSConnector struct { + IPFSHTTP struct { + NodeMultiaddress string `json:"node_multiaddress"` + } `json:"ipfshttp"` + } `json:"ipfs_connector"` + + Raw map[string]interface{} `json:"-"` +} + +func (cm *ClusterConfigManager) loadOrCreateConfig(path string) (*ClusterServiceConfig, error) { + if _, err := os.Stat(path); os.IsNotExist(err) { + return cm.createTemplateConfig(), nil + } + + data, err := os.ReadFile(path) + if err != nil { + return nil, fmt.Errorf("failed to read service.json: %w", err) + } + + var cfg ClusterServiceConfig + if err := json.Unmarshal(data, &cfg); err != nil { + return nil, fmt.Errorf("failed to parse service.json: %w", err) + } + + var raw map[string]interface{} + if err := json.Unmarshal(data, &raw); err != nil { + return nil, fmt.Errorf("failed to parse raw service.json: %w", err) + } + cfg.Raw = raw + + return &cfg, nil +} + +func (cm *ClusterConfigManager) saveConfig(path string, cfg *ClusterServiceConfig) error { + cm.updateNestedMap(cfg.Raw, "cluster", "peername", cfg.Cluster.Peername) + cm.updateNestedMap(cfg.Raw, "cluster", "secret", cfg.Cluster.Secret) + cm.updateNestedMap(cfg.Raw, "cluster", "listen_multiaddress", cfg.Cluster.ListenMultiaddress) + cm.updateNestedMap(cfg.Raw, "cluster", "peer_addresses", cfg.Cluster.PeerAddresses) + cm.updateNestedMap(cfg.Raw, "cluster", "leave_on_shutdown", cfg.Cluster.LeaveOnShutdown) + + consensus := cm.ensureRequiredSection(cfg.Raw, "consensus") + crdt := cm.ensureRequiredSection(consensus, "crdt") + crdt["cluster_name"] = cfg.Consensus.CRDT.ClusterName + crdt["trusted_peers"] = cfg.Consensus.CRDT.TrustedPeers + crdt["repair_interval"] = cfg.Consensus.CRDT.RepairInterval + + batching := cm.ensureRequiredSection(crdt, "batching") + batching["max_batch_size"] = cfg.Consensus.CRDT.Batching.MaxBatchSize + batching["max_batch_age"] = cfg.Consensus.CRDT.Batching.MaxBatchAge + + api := cm.ensureRequiredSection(cfg.Raw, "api") + restapi := cm.ensureRequiredSection(api, "restapi") + restapi["http_listen_multiaddress"] = cfg.API.RestAPI.HTTPListenMultiaddress + + ipfsproxy := cm.ensureRequiredSection(api, "ipfsproxy") + ipfsproxy["listen_multiaddress"] = cfg.API.IPFSProxy.ListenMultiaddress + ipfsproxy["node_multiaddress"] = cfg.API.IPFSProxy.NodeMultiaddress + + pinsvcapi := cm.ensureRequiredSection(api, "pinsvcapi") + pinsvcapi["http_listen_multiaddress"] = cfg.API.PinSvcAPI.HTTPListenMultiaddress + + ipfsConn := cm.ensureRequiredSection(cfg.Raw, "ipfs_connector") + ipfsHttp := cm.ensureRequiredSection(ipfsConn, "ipfshttp") + ipfsHttp["node_multiaddress"] = cfg.IPFSConnector.IPFSHTTP.NodeMultiaddress + + data, err := 
json.MarshalIndent(cfg.Raw, "", " ") + if err != nil { + return fmt.Errorf("failed to marshal service.json: %w", err) + } + + if err := os.MkdirAll(filepath.Dir(path), 0755); err != nil { + return fmt.Errorf("failed to create config directory: %w", err) + } + + return os.WriteFile(path, data, 0644) +} + +func (cm *ClusterConfigManager) updateNestedMap(m map[string]interface{}, section, key string, val interface{}) { + if _, ok := m[section]; !ok { + m[section] = make(map[string]interface{}) + } + s := m[section].(map[string]interface{}) + s[key] = val +} + +func (cm *ClusterConfigManager) ensureRequiredSection(m map[string]interface{}, key string) map[string]interface{} { + if _, ok := m[key]; !ok { + m[key] = make(map[string]interface{}) + } + return m[key].(map[string]interface{}) +} + diff --git a/pkg/ipfs/cluster_peer.go b/pkg/ipfs/cluster_peer.go new file mode 100644 index 0000000..b172b93 --- /dev/null +++ b/pkg/ipfs/cluster_peer.go @@ -0,0 +1,156 @@ +package ipfs + +import ( + "fmt" + "os" + "os/exec" + "path/filepath" + "strings" + "time" + + "github.com/libp2p/go-libp2p/core/host" + "github.com/multiformats/go-multiaddr" + "go.uber.org/zap" +) + +// UpdatePeerAddresses updates the peer_addresses in service.json with given multiaddresses +func (cm *ClusterConfigManager) UpdatePeerAddresses(addrs []string) error { + serviceJSONPath := filepath.Join(cm.clusterPath, "service.json") + cfg, err := cm.loadOrCreateConfig(serviceJSONPath) + if err != nil { + return err + } + + seen := make(map[string]bool) + uniqueAddrs := []string{} + for _, addr := range addrs { + if !seen[addr] { + uniqueAddrs = append(uniqueAddrs, addr) + seen[addr] = true + } + } + + cfg.Cluster.PeerAddresses = uniqueAddrs + return cm.saveConfig(serviceJSONPath, cfg) +} + +// UpdateAllClusterPeers discovers all cluster peers from the gateway and updates local config +func (cm *ClusterConfigManager) UpdateAllClusterPeers() error { + peers, err := cm.DiscoverClusterPeersFromGateway() + if err != nil { + return fmt.Errorf("failed to discover cluster peers: %w", err) + } + + if len(peers) == 0 { + return nil + } + + peerAddrs := []string{} + for _, p := range peers { + peerAddrs = append(peerAddrs, p.Multiaddress) + } + + return cm.UpdatePeerAddresses(peerAddrs) +} + +// RepairPeerConfiguration attempts to fix configuration issues and re-synchronize peers +func (cm *ClusterConfigManager) RepairPeerConfiguration() error { + cm.logger.Info("Attempting to repair IPFS Cluster peer configuration") + + _ = cm.FixIPFSConfigAddresses() + + peers, err := cm.DiscoverClusterPeersFromGateway() + if err != nil { + cm.logger.Warn("Could not discover peers from gateway during repair", zap.Error(err)) + } else { + peerAddrs := []string{} + for _, p := range peers { + peerAddrs = append(peerAddrs, p.Multiaddress) + } + if len(peerAddrs) > 0 { + _ = cm.UpdatePeerAddresses(peerAddrs) + } + } + + return nil +} + +// DiscoverClusterPeersFromGateway queries the central gateway for registered IPFS Cluster peers +func (cm *ClusterConfigManager) DiscoverClusterPeersFromGateway() ([]ClusterPeerInfo, error) { + // Not implemented - would require a central gateway URL in config + return nil, nil +} + +// DiscoverClusterPeersFromLibP2P uses libp2p host to find other cluster peers +func (cm *ClusterConfigManager) DiscoverClusterPeersFromLibP2P(h host.Host) error { + if h == nil { + return nil + } + + var clusterPeers []string + for _, p := range h.Peerstore().Peers() { + if p == h.ID() { + continue + } + + info := h.Peerstore().PeerInfo(p) + for _, 
addr := range info.Addrs { + if strings.Contains(addr.String(), "/tcp/9096") || strings.Contains(addr.String(), "/tcp/9094") { + ma := addr.Encapsulate(multiaddr.StringCast(fmt.Sprintf("/p2p/%s", p.String()))) + clusterPeers = append(clusterPeers, ma.String()) + } + } + } + + if len(clusterPeers) > 0 { + return cm.UpdatePeerAddresses(clusterPeers) + } + + return nil +} + +func (cm *ClusterConfigManager) getPeerID() (string, error) { + dataDir := cm.cfg.Node.DataDir + if strings.HasPrefix(dataDir, "~") { + home, _ := os.UserHomeDir() + dataDir = filepath.Join(home, dataDir[1:]) + } + + possiblePaths := []string{ + filepath.Join(dataDir, "ipfs", "repo"), + filepath.Join(dataDir, "node-1", "ipfs", "repo"), + filepath.Join(dataDir, "node-2", "ipfs", "repo"), + filepath.Join(filepath.Dir(dataDir), "node-1", "ipfs", "repo"), + filepath.Join(filepath.Dir(dataDir), "node-2", "ipfs", "repo"), + } + + var ipfsRepoPath string + for _, path := range possiblePaths { + if _, err := os.Stat(filepath.Join(path, "config")); err == nil { + ipfsRepoPath = path + break + } + } + + if ipfsRepoPath == "" { + return "", fmt.Errorf("could not find IPFS repo path") + } + + idCmd := exec.Command("ipfs", "id", "-f", "") + idCmd.Env = append(os.Environ(), "IPFS_PATH="+ipfsRepoPath) + out, err := idCmd.Output() + if err != nil { + return "", err + } + + return strings.TrimSpace(string(out)), nil +} + +// ClusterPeerInfo represents information about an IPFS Cluster peer +type ClusterPeerInfo struct { + ID string `json:"id"` + Multiaddress string `json:"multiaddress"` + NodeName string `json:"node_name"` + LastSeen time.Time `json:"last_seen"` +} + diff --git a/pkg/ipfs/cluster_util.go b/pkg/ipfs/cluster_util.go new file mode 100644 index 0000000..2f976da --- /dev/null +++ b/pkg/ipfs/cluster_util.go @@ -0,0 +1,119 @@ +package ipfs + +import ( + "crypto/rand" + "encoding/hex" + "fmt" + "net" + "net/http" + "net/url" + "os" + "strings" + "time" +) + +func loadOrGenerateClusterSecret(path string) (string, error) { + if data, err := os.ReadFile(path); err == nil { + secret := strings.TrimSpace(string(data)) + if len(secret) == 64 { + return secret, nil + } + } + + secret, err := generateRandomSecret() + if err != nil { + return "", err + } + + _ = os.WriteFile(path, []byte(secret), 0600) + return secret, nil +} + +func generateRandomSecret() (string, error) { + bytes := make([]byte, 32) + if _, err := rand.Read(bytes); err != nil { + return "", err + } + return hex.EncodeToString(bytes), nil +} + +func parseClusterPorts(rawURL string) (int, int, error) { + if !strings.HasPrefix(rawURL, "http") { + rawURL = "http://" + rawURL + } + u, err := url.Parse(rawURL) + if err != nil { + return 9096, 9094, nil + } + _, portStr, err := net.SplitHostPort(u.Host) + if err != nil { + return 9096, 9094, nil + } + var port int + fmt.Sscanf(portStr, "%d", &port) + if port == 0 { + return 9096, 9094, nil + } + return port + 2, port, nil +} + +func parseIPFSPort(rawURL string) (int, error) { + if !strings.HasPrefix(rawURL, "http") { + rawURL = "http://" + rawURL + } + u, err := url.Parse(rawURL) + if err != nil { + return 5001, nil + } + _, portStr, err := net.SplitHostPort(u.Host) + if err != nil { + return 5001, nil + } + var port int + fmt.Sscanf(portStr, "%d", &port) + if port == 0 { + return 5001, nil + } + return port, nil +} + +func parsePeerHostAndPort(multiaddr string) (string, int) { + parts := strings.Split(multiaddr, "/") + var hostStr string + var port int + for i, part := range parts { + if part == "ip4" || part == "dns" || part 
== "dns4" { + hostStr = parts[i+1] + } else if part == "tcp" { + fmt.Sscanf(parts[i+1], "%d", &port) + } + } + return hostStr, port +} + +func extractIPFromMultiaddrForCluster(maddr string) string { + parts := strings.Split(maddr, "/") + for i, part := range parts { + if (part == "ip4" || part == "dns" || part == "dns4") && i+1 < len(parts) { + return parts[i+1] + } + } + return "" +} + +func extractDomainFromMultiaddr(maddr string) string { + parts := strings.Split(maddr, "/") + for i, part := range parts { + if (part == "dns" || part == "dns4" || part == "dns6") && i+1 < len(parts) { + return parts[i+1] + } + } + return "" +} + +func newStandardHTTPClient() *http.Client { + return &http.Client{ + Timeout: 10 * time.Second, + } +} + diff --git a/pkg/node/gateway.go b/pkg/node/gateway.go new file mode 100644 index 0000000..9bada62 --- /dev/null +++ b/pkg/node/gateway.go @@ -0,0 +1,204 @@ +package node + +import ( + "context" + "crypto/tls" + "fmt" + "net" + "net/http" + "os" + "path/filepath" + + "github.com/DeBrosOfficial/network/pkg/gateway" + "github.com/DeBrosOfficial/network/pkg/ipfs" + "github.com/DeBrosOfficial/network/pkg/logging" + "golang.org/x/crypto/acme" + "golang.org/x/crypto/acme/autocert" +) + +// startHTTPGateway initializes and starts the full API gateway +func (n *Node) startHTTPGateway(ctx context.Context) error { + if !n.config.HTTPGateway.Enabled { + n.logger.ComponentInfo(logging.ComponentNode, "HTTP Gateway disabled in config") + return nil + } + + logFile := filepath.Join(os.ExpandEnv(n.config.Node.DataDir), "..", "logs", "gateway.log") + logsDir := filepath.Dir(logFile) + _ = os.MkdirAll(logsDir, 0755) + + gatewayLogger, err := logging.NewFileLogger(logging.ComponentGeneral, logFile, false) + if err != nil { + return err + } + + gwCfg := &gateway.Config{ + ListenAddr: n.config.HTTPGateway.ListenAddr, + ClientNamespace: n.config.HTTPGateway.ClientNamespace, + BootstrapPeers: n.config.Discovery.BootstrapPeers, + NodePeerID: loadNodePeerIDFromIdentity(n.config.Node.DataDir), + RQLiteDSN: n.config.HTTPGateway.RQLiteDSN, + OlricServers: n.config.HTTPGateway.OlricServers, + OlricTimeout: n.config.HTTPGateway.OlricTimeout, + IPFSClusterAPIURL: n.config.HTTPGateway.IPFSClusterAPIURL, + IPFSAPIURL: n.config.HTTPGateway.IPFSAPIURL, + IPFSTimeout: n.config.HTTPGateway.IPFSTimeout, + EnableHTTPS: n.config.HTTPGateway.HTTPS.Enabled, + DomainName: n.config.HTTPGateway.HTTPS.Domain, + TLSCacheDir: n.config.HTTPGateway.HTTPS.CacheDir, + } + + apiGateway, err := gateway.New(gatewayLogger, gwCfg) + if err != nil { + return err + } + n.apiGateway = apiGateway + + var certManager *autocert.Manager + if gwCfg.EnableHTTPS && gwCfg.DomainName != "" { + tlsCacheDir := gwCfg.TLSCacheDir + if tlsCacheDir == "" { + tlsCacheDir = "/home/debros/.orama/tls-cache" + } + _ = os.MkdirAll(tlsCacheDir, 0700) + + certManager = &autocert.Manager{ + Prompt: autocert.AcceptTOS, + HostPolicy: autocert.HostWhitelist(gwCfg.DomainName), + Cache: autocert.DirCache(tlsCacheDir), + Email: fmt.Sprintf("admin@%s", gwCfg.DomainName), + Client: &acme.Client{ + DirectoryURL: "https://acme-staging-v02.api.letsencrypt.org/directory", + }, + } + n.certManager = certManager + n.certReady = make(chan struct{}) + } + + httpReady := make(chan struct{}) + + go func() { + if gwCfg.EnableHTTPS && gwCfg.DomainName != "" && certManager != nil { + httpsPort := 443 + httpPort := 80 + + httpServer := &http.Server{ + Addr: fmt.Sprintf(":%d", httpPort), + Handler: certManager.HTTPHandler(http.HandlerFunc(func(w 
http.ResponseWriter, r *http.Request) { + target := fmt.Sprintf("https://%s%s", r.Host, r.URL.RequestURI()) + http.Redirect(w, r, target, http.StatusMovedPermanently) + })), + } + + httpListener, err := net.Listen("tcp", fmt.Sprintf(":%d", httpPort)) + if err != nil { + close(httpReady) + return + } + + go httpServer.Serve(httpListener) + + // Pre-provision cert + certReq := &tls.ClientHelloInfo{ServerName: gwCfg.DomainName} + _, certErr := certManager.GetCertificate(certReq) + + if certErr != nil { + close(httpReady) + httpServer.Handler = apiGateway.Routes() + return + } + + close(httpReady) + + tlsConfig := &tls.Config{ + MinVersion: tls.VersionTLS12, + GetCertificate: certManager.GetCertificate, + } + + httpsServer := &http.Server{ + Addr: fmt.Sprintf(":%d", httpsPort), + TLSConfig: tlsConfig, + Handler: apiGateway.Routes(), + } + n.apiGatewayServer = httpsServer + + ln, err := tls.Listen("tcp", fmt.Sprintf(":%d", httpsPort), tlsConfig) + if err == nil { + httpsServer.Serve(ln) + } + } else { + close(httpReady) + server := &http.Server{ + Addr: gwCfg.ListenAddr, + Handler: apiGateway.Routes(), + } + n.apiGatewayServer = server + ln, err := net.Listen("tcp", gwCfg.ListenAddr) + if err == nil { + server.Serve(ln) + } + } + }() + + // SNI Gateway + if n.config.HTTPGateway.SNI.Enabled && n.certManager != nil { + go n.startSNIGateway(ctx, httpReady) + } + + return nil +} + +func (n *Node) startSNIGateway(ctx context.Context, httpReady <-chan struct{}) { + <-httpReady + domain := n.config.HTTPGateway.HTTPS.Domain + if domain == "" { + return + } + + certReq := &tls.ClientHelloInfo{ServerName: domain} + tlsCert, err := n.certManager.GetCertificate(certReq) + if err != nil { + return + } + + tlsCacheDir := n.config.HTTPGateway.HTTPS.CacheDir + if tlsCacheDir == "" { + tlsCacheDir = "/home/debros/.orama/tls-cache" + } + + certPath := filepath.Join(tlsCacheDir, domain+".crt") + keyPath := filepath.Join(tlsCacheDir, domain+".key") + + if err := extractPEMFromTLSCert(tlsCert, certPath, keyPath); err == nil { + if n.certReady != nil { + close(n.certReady) + } + } + + sniCfg := n.config.HTTPGateway.SNI + sniGateway, err := gateway.NewTCPSNIGateway(n.logger, &sniCfg) + if err == nil { + n.sniGateway = sniGateway + sniGateway.Start(ctx) + } +} + +// startIPFSClusterConfig initializes and ensures IPFS Cluster configuration +func (n *Node) startIPFSClusterConfig() error { + n.logger.ComponentInfo(logging.ComponentNode, "Initializing IPFS Cluster configuration") + + cm, err := ipfs.NewClusterConfigManager(n.config, n.logger.Logger) + if err != nil { + return err + } + n.clusterConfigManager = cm + + _ = cm.FixIPFSConfigAddresses() + if err := cm.EnsureConfig(); err != nil { + return err + } + + _ = cm.RepairPeerConfiguration() + return nil +} + diff --git a/pkg/node/libp2p.go b/pkg/node/libp2p.go new file mode 100644 index 0000000..cd92226 --- /dev/null +++ b/pkg/node/libp2p.go @@ -0,0 +1,302 @@ +package node + +import ( + "context" + "fmt" + "os" + "path/filepath" + "strings" + "time" + + "github.com/DeBrosOfficial/network/pkg/discovery" + "github.com/DeBrosOfficial/network/pkg/encryption" + "github.com/DeBrosOfficial/network/pkg/logging" + "github.com/DeBrosOfficial/network/pkg/pubsub" + "github.com/libp2p/go-libp2p" + libp2ppubsub "github.com/libp2p/go-libp2p-pubsub" + "github.com/libp2p/go-libp2p/core/crypto" + "github.com/libp2p/go-libp2p/core/peer" + noise "github.com/libp2p/go-libp2p/p2p/security/noise" + "github.com/multiformats/go-multiaddr" + "go.uber.org/zap" +) + +// startLibP2P initializes 
the LibP2P host +func (n *Node) startLibP2P() error { + n.logger.ComponentInfo(logging.ComponentLibP2P, "Starting LibP2P host") + + // Load or create persistent identity + identity, err := n.loadOrCreateIdentity() + if err != nil { + return fmt.Errorf("failed to load identity: %w", err) + } + + // Create LibP2P host with explicit listen addresses + var opts []libp2p.Option + opts = append(opts, + libp2p.Identity(identity), + libp2p.Security(noise.ID, noise.New), + libp2p.DefaultMuxers, + ) + + // Add explicit listen addresses from config + if len(n.config.Node.ListenAddresses) > 0 { + listenAddrs := make([]multiaddr.Multiaddr, 0, len(n.config.Node.ListenAddresses)) + for _, addr := range n.config.Node.ListenAddresses { + ma, err := multiaddr.NewMultiaddr(addr) + if err != nil { + return fmt.Errorf("invalid listen address %s: %w", addr, err) + } + listenAddrs = append(listenAddrs, ma) + } + opts = append(opts, libp2p.ListenAddrs(listenAddrs...)) + n.logger.ComponentInfo(logging.ComponentLibP2P, "Configured listen addresses", + zap.Strings("addrs", n.config.Node.ListenAddresses)) + } + + // For localhost/development, disable NAT services + isLocalhost := len(n.config.Node.ListenAddresses) > 0 && + (strings.Contains(n.config.Node.ListenAddresses[0], "localhost") || + strings.Contains(n.config.Node.ListenAddresses[0], "127.0.0.1")) + + if isLocalhost { + n.logger.ComponentInfo(logging.ComponentLibP2P, "Localhost detected - disabling NAT services for local development") + } else { + n.logger.ComponentInfo(logging.ComponentLibP2P, "Production mode - enabling NAT services") + opts = append(opts, + libp2p.EnableNATService(), + libp2p.EnableAutoNATv2(), + libp2p.EnableRelay(), + libp2p.NATPortMap(), + libp2p.EnableAutoRelayWithPeerSource( + peerSource(n.config.Discovery.BootstrapPeers, n.logger.Logger), + ), + ) + } + + h, err := libp2p.New(opts...) 
+ if err != nil { + return err + } + + n.host = h + + // Initialize pubsub + ps, err := libp2ppubsub.NewGossipSub(context.Background(), h, + libp2ppubsub.WithPeerExchange(true), + libp2ppubsub.WithFloodPublish(true), + libp2ppubsub.WithDirectPeers(nil), + ) + if err != nil { + return fmt.Errorf("failed to create pubsub: %w", err) + } + + // Create pubsub adapter + n.pubsub = pubsub.NewClientAdapter(ps, n.config.Discovery.NodeNamespace) + n.logger.Info("Initialized pubsub adapter on namespace", zap.String("namespace", n.config.Discovery.NodeNamespace)) + + // Connect to peers + if err := n.connectToPeers(context.Background()); err != nil { + n.logger.ComponentWarn(logging.ComponentNode, "Failed to connect to peers", zap.Error(err)) + } + + // Start reconnection loop + if len(n.config.Discovery.BootstrapPeers) > 0 { + peerCtx, cancel := context.WithCancel(context.Background()) + n.peerDiscoveryCancel = cancel + + go n.peerReconnectionLoop(peerCtx) + } + + // Add peers to peerstore + for _, peerAddr := range n.config.Discovery.BootstrapPeers { + if ma, err := multiaddr.NewMultiaddr(peerAddr); err == nil { + if peerInfo, err := peer.AddrInfoFromP2pAddr(ma); err == nil { + n.host.Peerstore().AddAddrs(peerInfo.ID, peerInfo.Addrs, time.Hour*24) + } + } + } + + // Initialize discovery manager + n.discoveryManager = discovery.NewManager(h, nil, n.logger.Logger) + n.discoveryManager.StartProtocolHandler() + + n.logger.ComponentInfo(logging.ComponentNode, "LibP2P host started successfully") + + // Start peer discovery + n.startPeerDiscovery() + + return nil +} + +func (n *Node) peerReconnectionLoop(ctx context.Context) { + interval := 5 * time.Second + consecutiveFailures := 0 + + for { + select { + case <-ctx.Done(): + return + default: + } + + if !n.hasPeerConnections() { + if err := n.connectToPeers(context.Background()); err != nil { + consecutiveFailures++ + jitteredInterval := addJitter(interval) + + select { + case <-ctx.Done(): + return + case <-time.After(jitteredInterval): + } + + interval = calculateNextBackoff(interval) + } else { + interval = 5 * time.Second + consecutiveFailures = 0 + + select { + case <-ctx.Done(): + return + case <-time.After(30 * time.Second): + } + } + } else { + select { + case <-ctx.Done(): + return + case <-time.After(30 * time.Second): + } + } + } +} + +func (n *Node) connectToPeers(ctx context.Context) error { + for _, peerAddr := range n.config.Discovery.BootstrapPeers { + if err := n.connectToPeerAddr(ctx, peerAddr); err != nil { + continue + } + } + return nil +} + +func (n *Node) connectToPeerAddr(ctx context.Context, addr string) error { + ma, err := multiaddr.NewMultiaddr(addr) + if err != nil { + return err + } + peerInfo, err := peer.AddrInfoFromP2pAddr(ma) + if err != nil { + return err + } + if n.host != nil && peerInfo.ID == n.host.ID() { + return nil + } + return n.host.Connect(ctx, *peerInfo) +} + +func (n *Node) hasPeerConnections() bool { + if n.host == nil || len(n.config.Discovery.BootstrapPeers) == 0 { + return false + } + connectedPeers := n.host.Network().Peers() + if len(connectedPeers) == 0 { + return false + } + + bootstrapIDs := make(map[peer.ID]bool) + for _, addr := range n.config.Discovery.BootstrapPeers { + if ma, err := multiaddr.NewMultiaddr(addr); err == nil { + if info, err := peer.AddrInfoFromP2pAddr(ma); err == nil { + bootstrapIDs[info.ID] = true + } + } + } + + for _, p := range connectedPeers { + if bootstrapIDs[p] { + return true + } + } + return false +} + +func (n *Node) loadOrCreateIdentity() (crypto.PrivKey, error) { + 
identityFile := filepath.Join(os.ExpandEnv(n.config.Node.DataDir), "identity.key") + if strings.HasPrefix(identityFile, "~") { + home, _ := os.UserHomeDir() + identityFile = filepath.Join(home, identityFile[1:]) + } + + if _, err := os.Stat(identityFile); err == nil { + info, err := encryption.LoadIdentity(identityFile) + if err == nil { + return info.PrivateKey, nil + } + } + + info, err := encryption.GenerateIdentity() + if err != nil { + return nil, err + } + if err := encryption.SaveIdentity(info, identityFile); err != nil { + return nil, err + } + return info.PrivateKey, nil +} + +func (n *Node) startPeerDiscovery() { + if n.discoveryManager == nil { + return + } + discoveryConfig := discovery.Config{ + DiscoveryInterval: n.config.Discovery.DiscoveryInterval, + MaxConnections: n.config.Node.MaxConnections, + } + n.discoveryManager.Start(discoveryConfig) +} + +func (n *Node) stopPeerDiscovery() { + if n.discoveryManager != nil { + n.discoveryManager.Stop() + } +} + +func (n *Node) GetPeerID() string { + if n.host == nil { + return "" + } + return n.host.ID().String() +} + +func peerSource(peerAddrs []string, logger *zap.Logger) func(context.Context, int) <-chan peer.AddrInfo { + return func(ctx context.Context, num int) <-chan peer.AddrInfo { + out := make(chan peer.AddrInfo, num) + go func() { + defer close(out) + count := 0 + for _, s := range peerAddrs { + if count >= num { + return + } + ma, err := multiaddr.NewMultiaddr(s) + if err != nil { + continue + } + ai, err := peer.AddrInfoFromP2pAddr(ma) + if err != nil { + continue + } + select { + case out <- *ai: + count++ + case <-ctx.Done(): + return + } + } + }() + return out + } +} + diff --git a/pkg/node/monitoring.go b/pkg/node/monitoring.go index af3f46e..b63047a 100644 --- a/pkg/node/monitoring.go +++ b/pkg/node/monitoring.go @@ -220,9 +220,9 @@ func (n *Node) startConnectionMonitoring() { // First try to discover from LibP2P connections (works even if cluster peers aren't connected yet) // This runs every minute to discover peers automatically via LibP2P discovery if time.Now().Unix()%60 == 0 { - if success, err := n.clusterConfigManager.DiscoverClusterPeersFromLibP2P(n.host); err != nil { + if err := n.clusterConfigManager.DiscoverClusterPeersFromLibP2P(n.host); err != nil { n.logger.ComponentWarn(logging.ComponentNode, "Failed to discover cluster peers from LibP2P", zap.Error(err)) - } else if success { + } else { n.logger.ComponentInfo(logging.ComponentNode, "Cluster peer addresses discovered from LibP2P") } } @@ -230,16 +230,16 @@ func (n *Node) startConnectionMonitoring() { // Also try to update from cluster API (works once peers are connected) // Update all cluster peers every 2 minutes to discover new peers if time.Now().Unix()%120 == 0 { - if success, err := n.clusterConfigManager.UpdateAllClusterPeers(); err != nil { + if err := n.clusterConfigManager.UpdateAllClusterPeers(); err != nil { n.logger.ComponentWarn(logging.ComponentNode, "Failed to update cluster peers during monitoring", zap.Error(err)) - } else if success { + } else { n.logger.ComponentInfo(logging.ComponentNode, "Cluster peer addresses updated during monitoring") } // Try to repair peer configuration - if success, err := n.clusterConfigManager.RepairPeerConfiguration(); err != nil { + if err := n.clusterConfigManager.RepairPeerConfiguration(); err != nil { n.logger.ComponentWarn(logging.ComponentNode, "Failed to repair peer addresses during monitoring", zap.Error(err)) - } else if success { + } else { n.logger.ComponentInfo(logging.ComponentNode, "Peer 
configuration repaired during monitoring") } } diff --git a/pkg/node/node.go b/pkg/node/node.go index dc1d0be..eeb4d3b 100644 --- a/pkg/node/node.go +++ b/pkg/node/node.go @@ -2,38 +2,23 @@ package node import ( "context" - "crypto/tls" - "crypto/x509" - "encoding/pem" "fmt" - mathrand "math/rand" - "net" "net/http" "os" "path/filepath" "strings" "time" - "github.com/libp2p/go-libp2p" - libp2ppubsub "github.com/libp2p/go-libp2p-pubsub" - "github.com/libp2p/go-libp2p/core/crypto" - "github.com/libp2p/go-libp2p/core/host" - "github.com/libp2p/go-libp2p/core/peer" - - noise "github.com/libp2p/go-libp2p/p2p/security/noise" - "github.com/multiformats/go-multiaddr" - "go.uber.org/zap" - "golang.org/x/crypto/acme" - "golang.org/x/crypto/acme/autocert" - "github.com/DeBrosOfficial/network/pkg/config" "github.com/DeBrosOfficial/network/pkg/discovery" - "github.com/DeBrosOfficial/network/pkg/encryption" "github.com/DeBrosOfficial/network/pkg/gateway" "github.com/DeBrosOfficial/network/pkg/ipfs" "github.com/DeBrosOfficial/network/pkg/logging" "github.com/DeBrosOfficial/network/pkg/pubsub" database "github.com/DeBrosOfficial/network/pkg/rqlite" + "github.com/libp2p/go-libp2p/core/host" + "go.uber.org/zap" + "golang.org/x/crypto/acme/autocert" ) // Node represents a network node with RQLite database @@ -69,7 +54,6 @@ type Node struct { certManager *autocert.Manager // Certificate ready signal - closed when TLS certificates are extracted and ready for use - // Used to coordinate RQLite node-to-node TLS startup with certificate provisioning certReady chan struct{} } @@ -87,583 +71,66 @@ func NewNode(cfg *config.Config) (*Node, error) { }, nil } -// startRQLite initializes and starts the RQLite database -func (n *Node) startRQLite(ctx context.Context) error { - n.logger.Info("Starting RQLite database") - - // Determine node identifier for log filename - use node ID for unique filenames - nodeID := n.config.Node.ID - if nodeID == "" { - // Default to "node" if ID is not set - nodeID = "node" - } - - // Create RQLite manager - n.rqliteManager = database.NewRQLiteManager(&n.config.Database, &n.config.Discovery, n.config.Node.DataDir, n.logger.Logger) - n.rqliteManager.SetNodeType(nodeID) - - // Initialize cluster discovery service if LibP2P host is available - if n.host != nil && n.discoveryManager != nil { - // Create cluster discovery service (all nodes are unified) - n.clusterDiscovery = database.NewClusterDiscoveryService( - n.host, - n.discoveryManager, - n.rqliteManager, - n.config.Node.ID, - "node", // Unified node type - n.config.Discovery.RaftAdvAddress, - n.config.Discovery.HttpAdvAddress, - n.config.Node.DataDir, - n.logger.Logger, - ) - - // Set discovery service on RQLite manager BEFORE starting RQLite - // This is critical for pre-start cluster discovery during recovery - n.rqliteManager.SetDiscoveryService(n.clusterDiscovery) - - // Start cluster discovery (but don't trigger initial sync yet) - if err := n.clusterDiscovery.Start(ctx); err != nil { - return fmt.Errorf("failed to start cluster discovery: %w", err) - } - - // Publish initial metadata (with log_index=0) so peers can discover us during recovery - // The metadata will be updated with actual log index after RQLite starts - n.clusterDiscovery.UpdateOwnMetadata() - - n.logger.Info("Cluster discovery service started (waiting for RQLite)") - } - - // If node-to-node TLS is configured, wait for certificates to be provisioned - // This ensures RQLite can start with TLS when joining through the SNI gateway - if n.config.Database.NodeCert != 
"" && n.config.Database.NodeKey != "" && n.certReady != nil { - n.logger.Info("RQLite node TLS configured, waiting for certificates to be provisioned...", - zap.String("node_cert", n.config.Database.NodeCert), - zap.String("node_key", n.config.Database.NodeKey)) - - // Wait for certificate ready signal with timeout - certTimeout := 5 * time.Minute - select { - case <-n.certReady: - n.logger.Info("Certificates ready, proceeding with RQLite startup") - case <-time.After(certTimeout): - return fmt.Errorf("timeout waiting for TLS certificates after %v - ensure HTTPS is configured and ports 80/443 are accessible for ACME challenges", certTimeout) - case <-ctx.Done(): - return fmt.Errorf("context cancelled while waiting for certificates: %w", ctx.Err()) - } - } - - // Start RQLite FIRST before updating metadata - if err := n.rqliteManager.Start(ctx); err != nil { - return err - } - - // NOW update metadata after RQLite is running - if n.clusterDiscovery != nil { - n.clusterDiscovery.UpdateOwnMetadata() - n.clusterDiscovery.TriggerSync() // Do initial cluster sync now that RQLite is ready - n.logger.Info("RQLite metadata published and cluster synced") - } - - // Create adapter for sql.DB compatibility - adapter, err := database.NewRQLiteAdapter(n.rqliteManager) - if err != nil { - return fmt.Errorf("failed to create RQLite adapter: %w", err) - } - n.rqliteAdapter = adapter - - return nil -} - -// extractIPFromMultiaddr extracts the IP address from a peer multiaddr -// Supports IP4, IP6, DNS4, DNS6, and DNSADDR protocols -func extractIPFromMultiaddr(multiaddrStr string) string { - ma, err := multiaddr.NewMultiaddr(multiaddrStr) - if err != nil { - return "" - } - - // First, try to extract direct IP address - var ip string - var dnsName string - multiaddr.ForEach(ma, func(c multiaddr.Component) bool { - switch c.Protocol().Code { - case multiaddr.P_IP4, multiaddr.P_IP6: - ip = c.Value() - return false // Stop iteration - found IP - case multiaddr.P_DNS4, multiaddr.P_DNS6, multiaddr.P_DNSADDR: - dnsName = c.Value() - // Continue to check for IP, but remember DNS name as fallback - } - return true - }) - - // If we found a direct IP, return it - if ip != "" { - return ip - } - - // If we found a DNS name, try to resolve it - if dnsName != "" { - if resolvedIPs, err := net.LookupIP(dnsName); err == nil && len(resolvedIPs) > 0 { - // Prefer IPv4 addresses, but accept IPv6 if that's all we have - for _, resolvedIP := range resolvedIPs { - if resolvedIP.To4() != nil { - return resolvedIP.String() - } - } - // Return first IPv6 address if no IPv4 found - return resolvedIPs[0].String() - } - } - - return "" -} - -// peerSource returns a PeerSource that yields peers from configured peers. 
-func peerSource(peerAddrs []string, logger *zap.Logger) func(context.Context, int) <-chan peer.AddrInfo { - return func(ctx context.Context, num int) <-chan peer.AddrInfo { - out := make(chan peer.AddrInfo, num) - go func() { - defer close(out) - count := 0 - for _, s := range peerAddrs { - if count >= num { - return - } - ma, err := multiaddr.NewMultiaddr(s) - if err != nil { - logger.Debug("invalid peer multiaddr", zap.String("addr", s), zap.Error(err)) - continue - } - ai, err := peer.AddrInfoFromP2pAddr(ma) - if err != nil { - logger.Debug("failed to parse peer address", zap.String("addr", s), zap.Error(err)) - continue - } - select { - case out <- *ai: - count++ - case <-ctx.Done(): - return - } - } - }() - return out - } -} - -// hasPeerConnections checks if we're connected to any peers -func (n *Node) hasPeerConnections() bool { - if n.host == nil || len(n.config.Discovery.BootstrapPeers) == 0 { - return false - } - - connectedPeers := n.host.Network().Peers() - if len(connectedPeers) == 0 { - return false - } - - // Parse peer IDs - peerIDs := make(map[peer.ID]bool) - for _, peerAddr := range n.config.Discovery.BootstrapPeers { - ma, err := multiaddr.NewMultiaddr(peerAddr) - if err != nil { - continue - } - peerInfo, err := peer.AddrInfoFromP2pAddr(ma) - if err != nil { - continue - } - peerIDs[peerInfo.ID] = true - } - - // Check if any connected peer is in our peer list - for _, peerID := range connectedPeers { - if peerIDs[peerID] { - return true - } - } - - return false -} - -// calculateNextBackoff calculates the next backoff interval with exponential growth -func calculateNextBackoff(current time.Duration) time.Duration { - // Multiply by 1.5 for gentler exponential growth - next := time.Duration(float64(current) * 1.5) - // Cap at 10 minutes - maxInterval := 10 * time.Minute - if next > maxInterval { - next = maxInterval - } - return next -} - -// addJitter adds random jitter to prevent thundering herd -func addJitter(interval time.Duration) time.Duration { - // Add ±20% jitter - jitterPercent := 0.2 - jitterRange := float64(interval) * jitterPercent - jitter := (mathrand.Float64() - 0.5) * 2 * jitterRange // -jitterRange to +jitterRange - - result := time.Duration(float64(interval) + jitter) - // Ensure we don't go below 1 second - if result < time.Second { - result = time.Second - } - return result -} - -// connectToPeerAddr connects to a single peer address -func (n *Node) connectToPeerAddr(ctx context.Context, addr string) error { - ma, err := multiaddr.NewMultiaddr(addr) - if err != nil { - return fmt.Errorf("invalid multiaddr: %w", err) - } - - // Extract peer info from multiaddr - peerInfo, err := peer.AddrInfoFromP2pAddr(ma) - if err != nil { - return fmt.Errorf("failed to extract peer info: %w", err) - } - - // Avoid dialing ourselves: if the address resolves to our own peer ID, skip. 
- if n.host != nil && peerInfo.ID == n.host.ID() { - n.logger.ComponentDebug(logging.ComponentNode, "Skipping peer address because it resolves to self", - zap.String("addr", addr), - zap.String("peer_id", peerInfo.ID.String())) - return nil - } - - // Log resolved peer info prior to connect - n.logger.ComponentDebug(logging.ComponentNode, "Resolved peer", - zap.String("peer_id", peerInfo.ID.String()), - zap.String("addr", addr), - zap.Int("addr_count", len(peerInfo.Addrs)), - ) - - // Connect to the peer - if err := n.host.Connect(ctx, *peerInfo); err != nil { - return fmt.Errorf("failed to connect to peer: %w", err) - } - - n.logger.Info("Connected to peer", - zap.String("peer", peerInfo.ID.String()), - zap.String("addr", addr)) - - return nil -} - -// connectToPeers connects to configured LibP2P peers -func (n *Node) connectToPeers(ctx context.Context) error { - if len(n.config.Discovery.BootstrapPeers) == 0 { - n.logger.ComponentDebug(logging.ComponentNode, "No peers configured") - return nil - } - - // Use passed context with a reasonable timeout for peer connections - connectCtx, cancel := context.WithTimeout(ctx, 30*time.Second) - defer cancel() - - for _, peerAddr := range n.config.Discovery.BootstrapPeers { - if err := n.connectToPeerAddr(connectCtx, peerAddr); err != nil { - n.logger.ComponentWarn(logging.ComponentNode, "Failed to connect to peer", - zap.String("addr", peerAddr), - zap.Error(err)) - continue - } - } - - return nil -} - -// startLibP2P initializes the LibP2P host -func (n *Node) startLibP2P() error { - n.logger.ComponentInfo(logging.ComponentLibP2P, "Starting LibP2P host") - - // Load or create persistent identity - identity, err := n.loadOrCreateIdentity() - if err != nil { - return fmt.Errorf("failed to load identity: %w", err) - } - - // Create LibP2P host with explicit listen addresses - var opts []libp2p.Option - opts = append(opts, - libp2p.Identity(identity), - libp2p.Security(noise.ID, noise.New), - libp2p.DefaultMuxers, - ) - - // Add explicit listen addresses from config - if len(n.config.Node.ListenAddresses) > 0 { - listenAddrs := make([]multiaddr.Multiaddr, 0, len(n.config.Node.ListenAddresses)) - for _, addr := range n.config.Node.ListenAddresses { - ma, err := multiaddr.NewMultiaddr(addr) - if err != nil { - return fmt.Errorf("invalid listen address %s: %w", addr, err) - } - listenAddrs = append(listenAddrs, ma) - } - opts = append(opts, libp2p.ListenAddrs(listenAddrs...)) - n.logger.ComponentInfo(logging.ComponentLibP2P, "Configured listen addresses", - zap.Strings("addrs", n.config.Node.ListenAddresses)) - } - - // For localhost/development, disable NAT services - // For production, these would be enabled - isLocalhost := len(n.config.Node.ListenAddresses) > 0 && - (strings.Contains(n.config.Node.ListenAddresses[0], "localhost") || - strings.Contains(n.config.Node.ListenAddresses[0], "127.0.0.1")) - - if isLocalhost { - n.logger.ComponentInfo(logging.ComponentLibP2P, "Localhost detected - disabling NAT services for local development") - // Don't add NAT/AutoRelay options for localhost - } else { - n.logger.ComponentInfo(logging.ComponentLibP2P, "Production mode - enabling NAT services") - opts = append(opts, - libp2p.EnableNATService(), - libp2p.EnableAutoNATv2(), - libp2p.EnableRelay(), - libp2p.NATPortMap(), - libp2p.EnableAutoRelayWithPeerSource( - peerSource(n.config.Discovery.BootstrapPeers, n.logger.Logger), - ), - ) - } - - h, err := libp2p.New(opts...) 
- if err != nil { - return err - } - - n.host = h - - // Initialize pubsub - ps, err := libp2ppubsub.NewGossipSub(context.Background(), h, - libp2ppubsub.WithPeerExchange(true), - libp2ppubsub.WithFloodPublish(true), // Ensure messages reach all peers, not just mesh - libp2ppubsub.WithDirectPeers(nil), // Enable direct peer connections - ) - if err != nil { - return fmt.Errorf("failed to create pubsub: %w", err) - } - - // Create pubsub adapter with "node" namespace - n.pubsub = pubsub.NewClientAdapter(ps, n.config.Discovery.NodeNamespace) - n.logger.Info("Initialized pubsub adapter on namespace", zap.String("namespace", n.config.Discovery.NodeNamespace)) - - // Log configured peers - if len(n.config.Discovery.BootstrapPeers) > 0 { - n.logger.ComponentInfo(logging.ComponentNode, "Configured peers", - zap.Strings("peers", n.config.Discovery.BootstrapPeers)) - } else { - n.logger.ComponentDebug(logging.ComponentNode, "No peers configured") - } - - // Connect to LibP2P peers if configured - if err := n.connectToPeers(context.Background()); err != nil { - n.logger.ComponentWarn(logging.ComponentNode, "Failed to connect to peers", zap.Error(err)) - // Don't fail - continue without peer connections - } - - // Start exponential backoff reconnection for peers - if len(n.config.Discovery.BootstrapPeers) > 0 { - peerCtx, cancel := context.WithCancel(context.Background()) - n.peerDiscoveryCancel = cancel - - go func() { - interval := 5 * time.Second - consecutiveFailures := 0 - - n.logger.ComponentInfo(logging.ComponentNode, "Starting peer reconnection with exponential backoff", - zap.Duration("initial_interval", interval), - zap.Duration("max_interval", 10*time.Minute)) - - for { - select { - case <-peerCtx.Done(): - n.logger.ComponentDebug(logging.ComponentNode, "Peer reconnection loop stopped") - return - default: - } - - // Check if we need to attempt connection - if !n.hasPeerConnections() { - n.logger.ComponentDebug(logging.ComponentNode, "Attempting peer connection", - zap.Duration("current_interval", interval), - zap.Int("consecutive_failures", consecutiveFailures)) - - if err := n.connectToPeers(context.Background()); err != nil { - consecutiveFailures++ - // Calculate next backoff interval - jitteredInterval := addJitter(interval) - n.logger.ComponentDebug(logging.ComponentNode, "Peer connection failed, backing off", - zap.Error(err), - zap.Duration("next_attempt_in", jitteredInterval), - zap.Int("consecutive_failures", consecutiveFailures)) - - // Sleep with jitter - select { - case <-peerCtx.Done(): - return - case <-time.After(jitteredInterval): - } - - // Increase interval for next attempt - interval = calculateNextBackoff(interval) - - // Log interval increases occasionally to show progress - if consecutiveFailures%5 == 0 { - n.logger.ComponentInfo(logging.ComponentNode, "Peer connection still failing", - zap.Int("consecutive_failures", consecutiveFailures), - zap.Duration("current_interval", interval)) - } - } else { - // Success! 
Reset interval and counters - if consecutiveFailures > 0 { - n.logger.ComponentInfo(logging.ComponentNode, "Successfully connected to peers", - zap.Int("failures_overcome", consecutiveFailures)) - } - interval = 5 * time.Second - consecutiveFailures = 0 - - // Wait 30 seconds before checking connection again - select { - case <-peerCtx.Done(): - return - case <-time.After(30 * time.Second): - } - } - } else { - // We have peer connections, just wait and check periodically - select { - case <-peerCtx.Done(): - return - case <-time.After(30 * time.Second): - } - } - } - }() - } - - // Add peers to peerstore for peer exchange - if len(n.config.Discovery.BootstrapPeers) > 0 { - n.logger.ComponentInfo(logging.ComponentNode, "Adding peers to peerstore") - for _, peerAddr := range n.config.Discovery.BootstrapPeers { - if ma, err := multiaddr.NewMultiaddr(peerAddr); err == nil { - if peerInfo, err := peer.AddrInfoFromP2pAddr(ma); err == nil { - // Add to peerstore with longer TTL for peer exchange - n.host.Peerstore().AddAddrs(peerInfo.ID, peerInfo.Addrs, time.Hour*24) - n.logger.ComponentDebug(logging.ComponentNode, "Added peer to peerstore", - zap.String("peer", peerInfo.ID.String())) - } - } - } - } - - // Initialize discovery manager with peer exchange protocol - n.discoveryManager = discovery.NewManager(h, nil, n.logger.Logger) - n.discoveryManager.StartProtocolHandler() - - n.logger.ComponentInfo(logging.ComponentNode, "LibP2P host started successfully - using active peer exchange discovery") - - // Start peer discovery and monitoring - n.startPeerDiscovery() - - n.logger.ComponentInfo(logging.ComponentLibP2P, "LibP2P host started", - zap.String("peer_id", h.ID().String())) - - return nil -} - -// loadOrCreateIdentity loads an existing identity or creates a new one -// loadOrCreateIdentity loads an existing identity or creates a new one -func (n *Node) loadOrCreateIdentity() (crypto.PrivKey, error) { - identityFile := filepath.Join(n.config.Node.DataDir, "identity.key") +// Start starts the network node and all its services +func (n *Node) Start(ctx context.Context) error { + n.logger.Info("Starting network node", zap.String("data_dir", n.config.Node.DataDir)) // Expand ~ in data directory path - identityFile = os.ExpandEnv(identityFile) - if strings.HasPrefix(identityFile, "~") { + dataDir := n.config.Node.DataDir + dataDir = os.ExpandEnv(dataDir) + if strings.HasPrefix(dataDir, "~") { home, err := os.UserHomeDir() if err != nil { - return nil, fmt.Errorf("failed to determine home directory: %w", err) + return fmt.Errorf("failed to determine home directory: %w", err) } - identityFile = filepath.Join(home, identityFile[1:]) + dataDir = filepath.Join(home, dataDir[1:]) } - // Try to load existing identity using the shared package - if _, err := os.Stat(identityFile); err == nil { - info, err := encryption.LoadIdentity(identityFile) - if err != nil { - n.logger.Warn("Failed to load existing identity, creating new one", zap.Error(err)) - } else { - n.logger.ComponentInfo(logging.ComponentNode, "Loaded existing identity", - zap.String("file", identityFile), - zap.String("peer_id", info.PeerID.String())) - return info.PrivateKey, nil + // Create data directory + if err := os.MkdirAll(dataDir, 0755); err != nil { + return fmt.Errorf("failed to create data directory: %w", err) + } + + // Start HTTP Gateway first (doesn't depend on other services) + if err := n.startHTTPGateway(ctx); err != nil { + n.logger.ComponentWarn(logging.ComponentNode, "Failed to start HTTP Gateway", zap.Error(err)) + } + + 
// Start LibP2P host first (needed for cluster discovery) + if err := n.startLibP2P(); err != nil { + return fmt.Errorf("failed to start LibP2P: %w", err) + } + + // Initialize IPFS Cluster configuration if enabled + if n.config.Database.IPFS.ClusterAPIURL != "" { + if err := n.startIPFSClusterConfig(); err != nil { + n.logger.ComponentWarn(logging.ComponentNode, "Failed to initialize IPFS Cluster config", zap.Error(err)) } } - // Create new identity using shared package - n.logger.Info("Creating new identity", zap.String("file", identityFile)) - info, err := encryption.GenerateIdentity() - if err != nil { - return nil, fmt.Errorf("failed to generate identity: %w", err) + // Start RQLite with cluster discovery + if err := n.startRQLite(ctx); err != nil { + return fmt.Errorf("failed to start RQLite: %w", err) } - // Save identity using shared package - if err := encryption.SaveIdentity(info, identityFile); err != nil { - return nil, fmt.Errorf("failed to save identity: %w", err) + // Get listen addresses for logging + var listenAddrs []string + if n.host != nil { + for _, addr := range n.host.Addrs() { + listenAddrs = append(listenAddrs, addr.String()) + } } - n.logger.Info("Identity saved", - zap.String("file", identityFile), - zap.String("peer_id", info.PeerID.String())) + n.logger.ComponentInfo(logging.ComponentNode, "Network node started successfully", + zap.String("peer_id", n.GetPeerID()), + zap.Strings("listen_addrs", listenAddrs), + ) - return info.PrivateKey, nil + n.startConnectionMonitoring() + + return nil } -// GetPeerID returns the peer ID of this node -func (n *Node) GetPeerID() string { - if n.host == nil { - return "" - } - return n.host.ID().String() -} - -// startPeerDiscovery starts periodic peer discovery for the node -func (n *Node) startPeerDiscovery() { - if n.discoveryManager == nil { - n.logger.ComponentWarn(logging.ComponentNode, "Discovery manager not initialized") - return - } - - // Start the discovery manager with config from node config - discoveryConfig := discovery.Config{ - DiscoveryInterval: n.config.Discovery.DiscoveryInterval, - MaxConnections: n.config.Node.MaxConnections, - } - - if err := n.discoveryManager.Start(discoveryConfig); err != nil { - n.logger.ComponentWarn(logging.ComponentNode, "Failed to start discovery manager", zap.Error(err)) - return - } - - n.logger.ComponentInfo(logging.ComponentNode, "Peer discovery manager started", - zap.Duration("interval", discoveryConfig.DiscoveryInterval), - zap.Int("max_connections", discoveryConfig.MaxConnections)) -} - -// stopPeerDiscovery stops peer discovery -func (n *Node) stopPeerDiscovery() { - if n.discoveryManager != nil { - n.discoveryManager.Stop() - } - n.logger.ComponentInfo(logging.ComponentNode, "Peer discovery stopped") -} - -// getListenAddresses returns the current listen addresses as strings // Stop stops the node and all its services func (n *Node) Stop() error { n.logger.ComponentInfo(logging.ComponentNode, "Stopping network node") @@ -716,550 +183,3 @@ func (n *Node) Stop() error { n.logger.ComponentInfo(logging.ComponentNode, "Network node stopped") return nil } - -// loadNodePeerIDFromIdentity safely loads the node's peer ID from its identity file -// This is needed before the host is initialized, so we read directly from the file -func loadNodePeerIDFromIdentity(dataDir string) string { - identityFile := filepath.Join(os.ExpandEnv(dataDir), "identity.key") - - // Expand ~ in path - if strings.HasPrefix(identityFile, "~") { - home, err := os.UserHomeDir() - if err != nil { - return 
"" - } - identityFile = filepath.Join(home, identityFile[1:]) - } - - // Load identity from file - if info, err := encryption.LoadIdentity(identityFile); err == nil { - return info.PeerID.String() - } - - return "" // Return empty string if can't load (gateway will work without it) -} - -// startHTTPGateway initializes and starts the full API gateway with auth, pubsub, and API endpoints -func (n *Node) startHTTPGateway(ctx context.Context) error { - if !n.config.HTTPGateway.Enabled { - n.logger.ComponentInfo(logging.ComponentNode, "HTTP Gateway disabled in config") - return nil - } - - // Create separate logger for gateway - logFile := filepath.Join(os.ExpandEnv(n.config.Node.DataDir), "..", "logs", "gateway.log") - - // Ensure logs directory exists - logsDir := filepath.Dir(logFile) - if err := os.MkdirAll(logsDir, 0755); err != nil { - return fmt.Errorf("failed to create logs directory: %w", err) - } - - gatewayLogger, err := logging.NewFileLogger(logging.ComponentGeneral, logFile, false) - if err != nil { - return fmt.Errorf("failed to create gateway logger: %w", err) - } - - // Create full API Gateway for auth, pubsub, rqlite, and API endpoints - // This replaces both the old reverse proxy gateway and the standalone gateway - gwCfg := &gateway.Config{ - ListenAddr: n.config.HTTPGateway.ListenAddr, - ClientNamespace: n.config.HTTPGateway.ClientNamespace, - BootstrapPeers: n.config.Discovery.BootstrapPeers, - NodePeerID: loadNodePeerIDFromIdentity(n.config.Node.DataDir), // Load the node's actual peer ID from its identity file - RQLiteDSN: n.config.HTTPGateway.RQLiteDSN, - OlricServers: n.config.HTTPGateway.OlricServers, - OlricTimeout: n.config.HTTPGateway.OlricTimeout, - IPFSClusterAPIURL: n.config.HTTPGateway.IPFSClusterAPIURL, - IPFSAPIURL: n.config.HTTPGateway.IPFSAPIURL, - IPFSTimeout: n.config.HTTPGateway.IPFSTimeout, - // HTTPS/TLS configuration - EnableHTTPS: n.config.HTTPGateway.HTTPS.Enabled, - DomainName: n.config.HTTPGateway.HTTPS.Domain, - TLSCacheDir: n.config.HTTPGateway.HTTPS.CacheDir, - } - - apiGateway, err := gateway.New(gatewayLogger, gwCfg) - if err != nil { - return fmt.Errorf("failed to create full API gateway: %w", err) - } - - n.apiGateway = apiGateway - - // Check if HTTPS is enabled and set up certManager BEFORE starting goroutine - // This ensures n.certManager is set before SNI gateway initialization checks it - var certManager *autocert.Manager - var tlsCacheDir string - if gwCfg.EnableHTTPS && gwCfg.DomainName != "" { - tlsCacheDir = gwCfg.TLSCacheDir - if tlsCacheDir == "" { - tlsCacheDir = "/home/debros/.orama/tls-cache" - } - - // Ensure TLS cache directory exists and is writable - if err := os.MkdirAll(tlsCacheDir, 0700); err != nil { - n.logger.ComponentWarn(logging.ComponentNode, "Failed to create TLS cache directory", - zap.String("dir", tlsCacheDir), - zap.Error(err), - ) - } else { - n.logger.ComponentInfo(logging.ComponentNode, "TLS cache directory ready", - zap.String("dir", tlsCacheDir), - ) - } - - // Create TLS configuration with Let's Encrypt autocert - // Using STAGING environment to avoid rate limits during development/testing - // TODO: Switch to production when ready (remove Client field) - certManager = &autocert.Manager{ - Prompt: autocert.AcceptTOS, - HostPolicy: autocert.HostWhitelist(gwCfg.DomainName), - Cache: autocert.DirCache(tlsCacheDir), - Email: fmt.Sprintf("admin@%s", gwCfg.DomainName), - Client: &acme.Client{ - DirectoryURL: "https://acme-staging-v02.api.letsencrypt.org/directory", - }, - } - - // Store certificate manager 
for use by SNI gateway - n.certManager = certManager - - // Initialize certificate ready channel - will be closed when certs are extracted - // This allows RQLite to wait for certificates before starting with node TLS - n.certReady = make(chan struct{}) - } - - // Channel to signal when HTTP server is ready for ACME challenges - httpReady := make(chan struct{}) - - // Start API Gateway in a goroutine - go func() { - gatewayLogger.ComponentInfo(logging.ComponentGateway, "Starting full API gateway", - zap.String("listen_addr", gwCfg.ListenAddr), - ) - - // Check if HTTPS is enabled - if gwCfg.EnableHTTPS && gwCfg.DomainName != "" && certManager != nil { - // Start HTTPS server with automatic certificate provisioning - gatewayLogger.ComponentInfo(logging.ComponentGateway, "HTTPS enabled, starting secure gateway", - zap.String("domain", gwCfg.DomainName), - ) - - // Determine HTTPS and HTTP ports - httpsPort := 443 - httpPort := 80 - - // Start HTTP server for ACME challenges and redirects - // certManager.HTTPHandler() must be the main handler, with a fallback for other requests - httpServer := &http.Server{ - Addr: fmt.Sprintf(":%d", httpPort), - Handler: certManager.HTTPHandler(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { - // Fallback for non-ACME requests: redirect to HTTPS - target := fmt.Sprintf("https://%s%s", r.Host, r.URL.RequestURI()) - http.Redirect(w, r, target, http.StatusMovedPermanently) - })), - } - - // Create HTTP listener first to ensure port 80 is bound before signaling ready - gatewayLogger.ComponentInfo(logging.ComponentGateway, "Binding HTTP listener for ACME challenges", - zap.Int("port", httpPort), - ) - httpListener, err := net.Listen("tcp", fmt.Sprintf(":%d", httpPort)) - if err != nil { - gatewayLogger.ComponentError(logging.ComponentGateway, "failed to bind HTTP listener for ACME", zap.Error(err)) - close(httpReady) // Signal even on failure so SNI goroutine doesn't hang - return - } - gatewayLogger.ComponentInfo(logging.ComponentGateway, "HTTP server ready for ACME challenges", - zap.Int("port", httpPort), - zap.String("tls_cache_dir", tlsCacheDir), - ) - - // Start HTTP server in background for ACME challenges - go func() { - gatewayLogger.ComponentInfo(logging.ComponentGateway, "HTTP server serving ACME challenges", - zap.String("addr", httpServer.Addr), - ) - if err := httpServer.Serve(httpListener); err != nil && err != http.ErrServerClosed { - gatewayLogger.ComponentError(logging.ComponentGateway, "HTTP server error", zap.Error(err)) - } - }() - - // Pre-provision the certificate BEFORE starting HTTPS server - // This ensures we don't accept HTTPS connections without a valid certificate - gatewayLogger.ComponentInfo(logging.ComponentGateway, "Pre-provisioning TLS certificate...", - zap.String("domain", gwCfg.DomainName), - ) - - // Use a timeout context for certificate provisioning - // If Let's Encrypt is rate-limited or unreachable, don't block forever - certCtx, certCancel := context.WithTimeout(context.Background(), 30*time.Second) - defer certCancel() - - certReq := &tls.ClientHelloInfo{ - ServerName: gwCfg.DomainName, - } - - gatewayLogger.ComponentInfo(logging.ComponentGateway, "Initiating certificate request to Let's Encrypt", - zap.String("domain", gwCfg.DomainName), - zap.String("acme_environment", "staging"), - ) - - // Try to get certificate with timeout - certProvisionChan := make(chan error, 1) - go func() { - gatewayLogger.ComponentInfo(logging.ComponentGateway, "GetCertificate goroutine started") - _, err := 
certManager.GetCertificate(certReq) - if err != nil { - gatewayLogger.ComponentError(logging.ComponentGateway, "GetCertificate returned error", - zap.Error(err), - ) - } else { - gatewayLogger.ComponentInfo(logging.ComponentGateway, "GetCertificate succeeded") - } - certProvisionChan <- err - }() - - var certErr error - select { - case err := <-certProvisionChan: - certErr = err - if certErr != nil { - gatewayLogger.ComponentError(logging.ComponentGateway, "Certificate provisioning failed", - zap.String("domain", gwCfg.DomainName), - zap.Error(certErr), - ) - } - case <-certCtx.Done(): - certErr = fmt.Errorf("certificate provisioning timeout (Let's Encrypt may be rate-limited or unreachable)") - gatewayLogger.ComponentError(logging.ComponentGateway, "Certificate provisioning timeout", - zap.String("domain", gwCfg.DomainName), - zap.Duration("timeout", 30*time.Second), - zap.Error(certErr), - ) - } - - if certErr != nil { - gatewayLogger.ComponentError(logging.ComponentGateway, "Failed to provision TLS certificate - HTTPS disabled", - zap.String("domain", gwCfg.DomainName), - zap.Error(certErr), - zap.String("http_server_status", "running on port 80 for HTTP fallback"), - ) - // Signal ready for SNI goroutine (even though we're failing) - close(httpReady) - - // HTTP server on port 80 is already running, but it's configured to redirect to HTTPS - // Replace its handler to serve the gateway directly instead of redirecting - httpServer.Handler = apiGateway.Routes() - - gatewayLogger.ComponentInfo(logging.ComponentGateway, "HTTP gateway available on port 80 only", - zap.String("port", "80"), - ) - return - } - - gatewayLogger.ComponentInfo(logging.ComponentGateway, "TLS certificate provisioned successfully", - zap.String("domain", gwCfg.DomainName), - ) - - // Signal that HTTP server is ready for ACME challenges - close(httpReady) - - tlsConfig := &tls.Config{ - MinVersion: tls.VersionTLS12, - GetCertificate: certManager.GetCertificate, - } - - // Start HTTPS server - httpsServer := &http.Server{ - Addr: fmt.Sprintf(":%d", httpsPort), - TLSConfig: tlsConfig, - Handler: apiGateway.Routes(), - } - - n.apiGatewayServer = httpsServer - - listener, err := tls.Listen("tcp", fmt.Sprintf(":%d", httpsPort), tlsConfig) - if err != nil { - gatewayLogger.ComponentError(logging.ComponentGateway, "failed to create TLS listener", zap.Error(err)) - return - } - - gatewayLogger.ComponentInfo(logging.ComponentGateway, "HTTPS gateway listener bound", - zap.String("domain", gwCfg.DomainName), - zap.Int("port", httpsPort), - ) - - // Serve HTTPS - if err := httpsServer.Serve(listener); err != nil && err != http.ErrServerClosed { - gatewayLogger.ComponentError(logging.ComponentGateway, "HTTPS Gateway error", zap.Error(err)) - } - } else { - // No HTTPS - signal ready immediately (no ACME needed) - close(httpReady) - - // Start plain HTTP server - server := &http.Server{ - Addr: gwCfg.ListenAddr, - Handler: apiGateway.Routes(), - } - - n.apiGatewayServer = server - - // Try to bind listener - ln, err := net.Listen("tcp", gwCfg.ListenAddr) - if err != nil { - gatewayLogger.ComponentError(logging.ComponentGateway, "failed to bind API gateway listener", zap.Error(err)) - return - } - - gatewayLogger.ComponentInfo(logging.ComponentGateway, "API gateway listener bound", zap.String("listen_addr", ln.Addr().String())) - - // Serve HTTP - if err := server.Serve(ln); err != nil && err != http.ErrServerClosed { - gatewayLogger.ComponentError(logging.ComponentGateway, "API Gateway error", zap.Error(err)) - } - } - }() - - // 
Initialize and start SNI gateway if HTTPS is enabled and SNI is configured - // This runs in a separate goroutine that waits for HTTP server to be ready - if n.config.HTTPGateway.SNI.Enabled && n.certManager != nil { - go func() { - // Wait for HTTP server to be ready for ACME challenges - gatewayLogger.ComponentInfo(logging.ComponentGateway, "Waiting for HTTP server before SNI initialization...") - <-httpReady - - gatewayLogger.ComponentInfo(logging.ComponentGateway, "Initializing SNI gateway", - zap.String("listen_addr", n.config.HTTPGateway.SNI.ListenAddr), - ) - - // Provision the certificate from Let's Encrypt cache - // This ensures the certificate file is downloaded and cached - domain := n.config.HTTPGateway.HTTPS.Domain - if domain != "" { - gatewayLogger.ComponentInfo(logging.ComponentGateway, "Provisioning certificate for SNI", - zap.String("domain", domain)) - - certReq := &tls.ClientHelloInfo{ - ServerName: domain, - } - if tlsCert, err := n.certManager.GetCertificate(certReq); err != nil { - gatewayLogger.ComponentError(logging.ComponentGateway, "Failed to provision certificate for SNI", - zap.String("domain", domain), zap.Error(err)) - return // Can't start SNI without certificate - } else { - gatewayLogger.ComponentInfo(logging.ComponentGateway, "Certificate provisioned for SNI", - zap.String("domain", domain)) - - // Extract certificate to PEM files for SNI gateway - // SNI gateway needs standard PEM cert files, not autocert cache format - tlsCacheDir := n.config.HTTPGateway.HTTPS.CacheDir - if tlsCacheDir == "" { - tlsCacheDir = "/home/debros/.orama/tls-cache" - } - - certPath := filepath.Join(tlsCacheDir, domain+".crt") - keyPath := filepath.Join(tlsCacheDir, domain+".key") - - if err := extractPEMFromTLSCert(tlsCert, certPath, keyPath); err != nil { - gatewayLogger.ComponentError(logging.ComponentGateway, "Failed to extract PEM from TLS cert for SNI", - zap.Error(err)) - return // Can't start SNI without PEM files - } - gatewayLogger.ComponentInfo(logging.ComponentGateway, "PEM certificates extracted for SNI", - zap.String("cert_path", certPath), zap.String("key_path", keyPath)) - - // Signal that certificates are ready for RQLite node-to-node TLS - if n.certReady != nil { - close(n.certReady) - gatewayLogger.ComponentInfo(logging.ComponentGateway, "Certificate ready signal sent for RQLite node TLS") - } - } - } else { - gatewayLogger.ComponentError(logging.ComponentGateway, "No domain configured for SNI certificate") - return - } - - // Create SNI config with certificate files - sniCfg := n.config.HTTPGateway.SNI - - // Use the same gateway logger for SNI gateway (writes to gateway.log) - sniGateway, err := gateway.NewTCPSNIGateway(gatewayLogger, &sniCfg) - if err != nil { - gatewayLogger.ComponentError(logging.ComponentGateway, "Failed to initialize SNI gateway", zap.Error(err)) - return - } - - n.sniGateway = sniGateway - gatewayLogger.ComponentInfo(logging.ComponentGateway, "SNI gateway initialized, starting...") - - // Start SNI gateway (this blocks until shutdown) - if err := n.sniGateway.Start(ctx); err != nil { - gatewayLogger.ComponentError(logging.ComponentGateway, "SNI Gateway error", zap.Error(err)) - } - }() - } - - return nil -} - -// extractPEMFromTLSCert extracts certificate and private key from tls.Certificate to PEM files -func extractPEMFromTLSCert(tlsCert *tls.Certificate, certPath, keyPath string) error { - if tlsCert == nil || len(tlsCert.Certificate) == 0 { - return fmt.Errorf("invalid tls certificate") - } - - // Write certificate chain to PEM 
file - certFile, err := os.Create(certPath) - if err != nil { - return fmt.Errorf("failed to create cert file: %w", err) - } - defer certFile.Close() - - // Write all certificates in the chain - for _, certBytes := range tlsCert.Certificate { - if err := pem.Encode(certFile, &pem.Block{ - Type: "CERTIFICATE", - Bytes: certBytes, - }); err != nil { - return fmt.Errorf("failed to encode certificate: %w", err) - } - } - - // Write private key to PEM file - if tlsCert.PrivateKey == nil { - return fmt.Errorf("private key is nil") - } - - keyFile, err := os.Create(keyPath) - if err != nil { - return fmt.Errorf("failed to create key file: %w", err) - } - defer keyFile.Close() - - // Handle different key types - var keyBytes []byte - switch key := tlsCert.PrivateKey.(type) { - case *x509.Certificate: - keyBytes, err = x509.MarshalPKCS8PrivateKey(key) - if err != nil { - return fmt.Errorf("failed to marshal private key: %w", err) - } - default: - // Try to marshal as PKCS8 - keyBytes, err = x509.MarshalPKCS8PrivateKey(tlsCert.PrivateKey) - if err != nil { - return fmt.Errorf("failed to marshal private key: %w", err) - } - } - - if err := pem.Encode(keyFile, &pem.Block{ - Type: "PRIVATE KEY", - Bytes: keyBytes, - }); err != nil { - return fmt.Errorf("failed to encode private key: %w", err) - } - - // Set proper permissions - os.Chmod(certPath, 0644) - os.Chmod(keyPath, 0600) - - return nil -} - -// Starts the network node -func (n *Node) Start(ctx context.Context) error { - n.logger.Info("Starting network node", zap.String("data_dir", n.config.Node.DataDir)) - - // Expand ~ in data directory path - dataDir := n.config.Node.DataDir - dataDir = os.ExpandEnv(dataDir) - if strings.HasPrefix(dataDir, "~") { - home, err := os.UserHomeDir() - if err != nil { - return fmt.Errorf("failed to determine home directory: %w", err) - } - dataDir = filepath.Join(home, dataDir[1:]) - } - - // Create data directory - if err := os.MkdirAll(dataDir, 0755); err != nil { - return fmt.Errorf("failed to create data directory: %w", err) - } - - // Start HTTP Gateway first (doesn't depend on other services) - if err := n.startHTTPGateway(ctx); err != nil { - n.logger.ComponentWarn(logging.ComponentNode, "Failed to start HTTP Gateway", zap.Error(err)) - // Don't fail node startup if gateway fails - } - - // Start LibP2P host first (needed for cluster discovery) - if err := n.startLibP2P(); err != nil { - return fmt.Errorf("failed to start LibP2P: %w", err) - } - - // Initialize IPFS Cluster configuration if enabled - if n.config.Database.IPFS.ClusterAPIURL != "" { - if err := n.startIPFSClusterConfig(); err != nil { - n.logger.ComponentWarn(logging.ComponentNode, "Failed to initialize IPFS Cluster config", zap.Error(err)) - // Don't fail node startup if cluster config fails - } - } - - // Start RQLite with cluster discovery - if err := n.startRQLite(ctx); err != nil { - return fmt.Errorf("failed to start RQLite: %w", err) - } - - // Get listen addresses for logging - var listenAddrs []string - for _, addr := range n.host.Addrs() { - listenAddrs = append(listenAddrs, addr.String()) - } - - n.logger.ComponentInfo(logging.ComponentNode, "Network node started successfully", - zap.String("peer_id", n.host.ID().String()), - zap.Strings("listen_addrs", listenAddrs), - ) - - n.startConnectionMonitoring() - - return nil -} - -// startIPFSClusterConfig initializes and ensures IPFS Cluster configuration -func (n *Node) startIPFSClusterConfig() error { - n.logger.ComponentInfo(logging.ComponentNode, "Initializing IPFS Cluster 
configuration") - - // Create config manager - cm, err := ipfs.NewClusterConfigManager(n.config, n.logger.Logger) - if err != nil { - return fmt.Errorf("failed to create cluster config manager: %w", err) - } - n.clusterConfigManager = cm - - // Fix IPFS config addresses (localhost -> 127.0.0.1) before ensuring cluster config - if err := cm.FixIPFSConfigAddresses(); err != nil { - n.logger.ComponentWarn(logging.ComponentNode, "Failed to fix IPFS config addresses", zap.Error(err)) - // Don't fail startup if config fix fails - cluster config will handle it - } - - // Ensure configuration exists and is correct - if err := cm.EnsureConfig(); err != nil { - return fmt.Errorf("failed to ensure cluster config: %w", err) - } - - // Try to repair peer configuration automatically - // This will be retried periodically if peer is not available yet - if success, err := cm.RepairPeerConfiguration(); err != nil { - n.logger.ComponentWarn(logging.ComponentNode, "Failed to repair peer configuration, will retry later", zap.Error(err)) - } else if success { - n.logger.ComponentInfo(logging.ComponentNode, "Peer configuration repaired successfully") - } else { - n.logger.ComponentDebug(logging.ComponentNode, "Peer not available yet, will retry periodically") - } - - n.logger.ComponentInfo(logging.ComponentNode, "IPFS Cluster configuration initialized") - return nil -} diff --git a/pkg/node/rqlite.go b/pkg/node/rqlite.go new file mode 100644 index 0000000..8e5523d --- /dev/null +++ b/pkg/node/rqlite.go @@ -0,0 +1,98 @@ +package node + +import ( + "context" + "fmt" + + database "github.com/DeBrosOfficial/network/pkg/rqlite" + "go.uber.org/zap" + "time" +) + +// startRQLite initializes and starts the RQLite database +func (n *Node) startRQLite(ctx context.Context) error { + n.logger.Info("Starting RQLite database") + + // Determine node identifier for log filename - use node ID for unique filenames + nodeID := n.config.Node.ID + if nodeID == "" { + // Default to "node" if ID is not set + nodeID = "node" + } + + // Create RQLite manager + n.rqliteManager = database.NewRQLiteManager(&n.config.Database, &n.config.Discovery, n.config.Node.DataDir, n.logger.Logger) + n.rqliteManager.SetNodeType(nodeID) + + // Initialize cluster discovery service if LibP2P host is available + if n.host != nil && n.discoveryManager != nil { + // Create cluster discovery service (all nodes are unified) + n.clusterDiscovery = database.NewClusterDiscoveryService( + n.host, + n.discoveryManager, + n.rqliteManager, + n.config.Node.ID, + "node", // Unified node type + n.config.Discovery.RaftAdvAddress, + n.config.Discovery.HttpAdvAddress, + n.config.Node.DataDir, + n.logger.Logger, + ) + + // Set discovery service on RQLite manager BEFORE starting RQLite + // This is critical for pre-start cluster discovery during recovery + n.rqliteManager.SetDiscoveryService(n.clusterDiscovery) + + // Start cluster discovery (but don't trigger initial sync yet) + if err := n.clusterDiscovery.Start(ctx); err != nil { + return fmt.Errorf("failed to start cluster discovery: %w", err) + } + + // Publish initial metadata (with log_index=0) so peers can discover us during recovery + // The metadata will be updated with actual log index after RQLite starts + n.clusterDiscovery.UpdateOwnMetadata() + + n.logger.Info("Cluster discovery service started (waiting for RQLite)") + } + + // If node-to-node TLS is configured, wait for certificates to be provisioned + // This ensures RQLite can start with TLS when joining through the SNI gateway + if 
n.config.Database.NodeCert != "" && n.config.Database.NodeKey != "" && n.certReady != nil { + n.logger.Info("RQLite node TLS configured, waiting for certificates to be provisioned...", + zap.String("node_cert", n.config.Database.NodeCert), + zap.String("node_key", n.config.Database.NodeKey)) + + // Wait for certificate ready signal with timeout + certTimeout := 5 * time.Minute + select { + case <-n.certReady: + n.logger.Info("Certificates ready, proceeding with RQLite startup") + case <-time.After(certTimeout): + return fmt.Errorf("timeout waiting for TLS certificates after %v - ensure HTTPS is configured and ports 80/443 are accessible for ACME challenges", certTimeout) + case <-ctx.Done(): + return fmt.Errorf("context cancelled while waiting for certificates: %w", ctx.Err()) + } + } + + // Start RQLite FIRST before updating metadata + if err := n.rqliteManager.Start(ctx); err != nil { + return err + } + + // NOW update metadata after RQLite is running + if n.clusterDiscovery != nil { + n.clusterDiscovery.UpdateOwnMetadata() + n.clusterDiscovery.TriggerSync() // Do initial cluster sync now that RQLite is ready + n.logger.Info("RQLite metadata published and cluster synced") + } + + // Create adapter for sql.DB compatibility + adapter, err := database.NewRQLiteAdapter(n.rqliteManager) + if err != nil { + return fmt.Errorf("failed to create RQLite adapter: %w", err) + } + n.rqliteAdapter = adapter + + return nil +} + diff --git a/pkg/node/utils.go b/pkg/node/utils.go new file mode 100644 index 0000000..d9d366c --- /dev/null +++ b/pkg/node/utils.go @@ -0,0 +1,127 @@ +package node + +import ( + "crypto/tls" + "crypto/x509" + "encoding/pem" + "fmt" + mathrand "math/rand" + "net" + "os" + "path/filepath" + "strings" + "time" + + "github.com/DeBrosOfficial/network/pkg/encryption" + "github.com/multiformats/go-multiaddr" +) + +func extractIPFromMultiaddr(multiaddrStr string) string { + ma, err := multiaddr.NewMultiaddr(multiaddrStr) + if err != nil { + return "" + } + + var ip string + var dnsName string + multiaddr.ForEach(ma, func(c multiaddr.Component) bool { + switch c.Protocol().Code { + case multiaddr.P_IP4, multiaddr.P_IP6: + ip = c.Value() + return false + case multiaddr.P_DNS4, multiaddr.P_DNS6, multiaddr.P_DNSADDR: + dnsName = c.Value() + } + return true + }) + + if ip != "" { + return ip + } + + if dnsName != "" { + if resolvedIPs, err := net.LookupIP(dnsName); err == nil && len(resolvedIPs) > 0 { + for _, resolvedIP := range resolvedIPs { + if resolvedIP.To4() != nil { + return resolvedIP.String() + } + } + return resolvedIPs[0].String() + } + } + + return "" +} + +func calculateNextBackoff(current time.Duration) time.Duration { + next := time.Duration(float64(current) * 1.5) + maxInterval := 10 * time.Minute + if next > maxInterval { + next = maxInterval + } + return next +} + +func addJitter(interval time.Duration) time.Duration { + jitterPercent := 0.2 + jitterRange := float64(interval) * jitterPercent + jitter := (mathrand.Float64() - 0.5) * 2 * jitterRange + result := time.Duration(float64(interval) + jitter) + if result < time.Second { + result = time.Second + } + return result +} + +func loadNodePeerIDFromIdentity(dataDir string) string { + identityFile := filepath.Join(os.ExpandEnv(dataDir), "identity.key") + if strings.HasPrefix(identityFile, "~") { + home, _ := os.UserHomeDir() + identityFile = filepath.Join(home, identityFile[1:]) + } + + if info, err := encryption.LoadIdentity(identityFile); err == nil { + return info.PeerID.String() + } + return "" +} + +func 
extractPEMFromTLSCert(tlsCert *tls.Certificate, certPath, keyPath string) error { + if tlsCert == nil || len(tlsCert.Certificate) == 0 { + return fmt.Errorf("invalid tls certificate") + } + + certFile, err := os.Create(certPath) + if err != nil { + return err + } + defer certFile.Close() + + for _, certBytes := range tlsCert.Certificate { + pem.Encode(certFile, &pem.Block{Type: "CERTIFICATE", Bytes: certBytes}) + } + + if tlsCert.PrivateKey == nil { + return fmt.Errorf("private key is nil") + } + + keyFile, err := os.Create(keyPath) + if err != nil { + return err + } + defer keyFile.Close() + + // Marshal the private key as PKCS#8. A tls.Certificate private key is a + // crypto key (RSA, ECDSA, or Ed25519), never an *x509.Certificate, so marshal + // it directly and surface any error rather than ignoring it. + keyBytes, err := x509.MarshalPKCS8PrivateKey(tlsCert.PrivateKey) + if err != nil { + return fmt.Errorf("failed to marshal private key: %w", err) + } + + pem.Encode(keyFile, &pem.Block{Type: "PRIVATE KEY", Bytes: keyBytes}) + os.Chmod(certPath, 0644) + os.Chmod(keyPath, 0600) + return nil +} + diff --git a/pkg/rqlite/cluster.go b/pkg/rqlite/cluster.go new file mode 100644 index 0000000..4b3b172 --- /dev/null +++ b/pkg/rqlite/cluster.go @@ -0,0 +1,301 @@ +package rqlite + +import ( + "context" + "encoding/json" + "fmt" + "net/http" + "os" + "path/filepath" + "strings" + "time" +) + +// establishLeadershipOrJoin handles post-startup cluster establishment +func (r *RQLiteManager) establishLeadershipOrJoin(ctx context.Context, rqliteDataDir string) error { + timeout := 5 * time.Minute + if r.config.RQLiteJoinAddress == "" { + timeout = 2 * time.Minute + } + + sqlCtx, cancel := context.WithTimeout(ctx, timeout) + defer cancel() + + if err := r.waitForSQLAvailable(sqlCtx); err != nil { + if r.cmd != nil && r.cmd.Process != nil { + _ = r.cmd.Process.Kill() + } + return err + } + + return nil +} + +// waitForMinClusterSizeBeforeStart waits for minimum cluster size to be discovered +func (r *RQLiteManager) waitForMinClusterSizeBeforeStart(ctx context.Context, rqliteDataDir string) error { + if r.discoveryService == nil { + return fmt.Errorf("discovery service not available") + } + + requiredRemotePeers := r.config.MinClusterSize - 1 + _ = r.discoveryService.TriggerPeerExchange(ctx) + + checkInterval := 2 * time.Second + for { + select { + case <-ctx.Done(): + return ctx.Err() + default: + } + + r.discoveryService.TriggerSync() + time.Sleep(checkInterval) + + allPeers := r.discoveryService.GetAllPeers() + remotePeerCount := 0 + for _, peer := range allPeers { + if peer.NodeID != r.discoverConfig.RaftAdvAddress { + remotePeerCount++ + } + } + + if remotePeerCount >= requiredRemotePeers { + peersPath := filepath.Join(rqliteDataDir, "raft", "peers.json") + r.discoveryService.TriggerSync() + time.Sleep(2 * time.Second) + + if info, err := os.Stat(peersPath); err == nil && info.Size() > 10 { + data, err := os.ReadFile(peersPath) + if err == nil { + var peers []map[string]interface{} + if err := json.Unmarshal(data, &peers); err == nil && len(peers) >= requiredRemotePeers { + return nil + } + } + } + } + } +} + +// performPreStartClusterDiscovery builds peers.json before starting RQLite +func (r *RQLiteManager) performPreStartClusterDiscovery(ctx context.Context, rqliteDataDir string) error { + if r.discoveryService == nil { + return fmt.Errorf("discovery service not available") + } + + _ = r.discoveryService.TriggerPeerExchange(ctx) + time.Sleep(1 * time.Second) + r.discoveryService.TriggerSync() + time.Sleep(2 * time.Second) + + discoveryDeadline := time.Now().Add(30 * time.Second) + var discoveredPeers int + + for 
time.Now().Before(discoveryDeadline) { + allPeers := r.discoveryService.GetAllPeers() + discoveredPeers = len(allPeers) + + if discoveredPeers >= r.config.MinClusterSize { + break + } + time.Sleep(2 * time.Second) + } + + if discoveredPeers <= 1 { + return nil + } + + if r.hasExistingRaftState(rqliteDataDir) { + ourLogIndex := r.getRaftLogIndex() + maxPeerIndex := uint64(0) + for _, peer := range r.discoveryService.GetAllPeers() { + if peer.NodeID != r.discoverConfig.RaftAdvAddress && peer.RaftLogIndex > maxPeerIndex { + maxPeerIndex = peer.RaftLogIndex + } + } + + if ourLogIndex == 0 && maxPeerIndex > 0 { + _ = r.clearRaftState(rqliteDataDir) + _ = r.discoveryService.ForceWritePeersJSON() + } + } + + r.discoveryService.TriggerSync() + time.Sleep(2 * time.Second) + + return nil +} + +// recoverCluster restarts RQLite using peers.json +func (r *RQLiteManager) recoverCluster(ctx context.Context, peersJSONPath string) error { + _ = r.Stop() + time.Sleep(2 * time.Second) + + rqliteDataDir, err := r.rqliteDataDirPath() + if err != nil { + return err + } + + if err := r.launchProcess(ctx, rqliteDataDir); err != nil { + return err + } + + return r.waitForReadyAndConnect(ctx) +} + +// recoverFromSplitBrain automatically recovers from split-brain state +func (r *RQLiteManager) recoverFromSplitBrain(ctx context.Context) error { + if r.discoveryService == nil { + return fmt.Errorf("discovery service not available") + } + + r.discoveryService.TriggerPeerExchange(ctx) + time.Sleep(2 * time.Second) + r.discoveryService.TriggerSync() + time.Sleep(2 * time.Second) + + rqliteDataDir, _ := r.rqliteDataDirPath() + ourIndex := r.getRaftLogIndex() + + maxPeerIndex := uint64(0) + for _, peer := range r.discoveryService.GetAllPeers() { + if peer.NodeID != r.discoverConfig.RaftAdvAddress && peer.RaftLogIndex > maxPeerIndex { + maxPeerIndex = peer.RaftLogIndex + } + } + + if ourIndex == 0 && maxPeerIndex > 0 { + _ = r.clearRaftState(rqliteDataDir) + r.discoveryService.TriggerPeerExchange(ctx) + time.Sleep(1 * time.Second) + _ = r.discoveryService.ForceWritePeersJSON() + return r.recoverCluster(ctx, filepath.Join(rqliteDataDir, "raft", "peers.json")) + } + + return nil +} + +// isInSplitBrainState detects if we're in a split-brain scenario +func (r *RQLiteManager) isInSplitBrainState() bool { + status, err := r.getRQLiteStatus() + if err != nil || r.discoveryService == nil { + return false + } + + raft := status.Store.Raft + if raft.State == "Follower" && raft.Term == 0 && raft.NumPeers == 0 && !raft.Voter { + peers := r.discoveryService.GetActivePeers() + if len(peers) == 0 { + return false + } + + reachableCount := 0 + splitBrainCount := 0 + for _, peer := range peers { + if r.isPeerReachable(peer.HTTPAddress) { + reachableCount++ + peerStatus, err := r.getPeerRQLiteStatus(peer.HTTPAddress) + if err == nil { + praft := peerStatus.Store.Raft + if praft.State == "Follower" && praft.Term == 0 && praft.NumPeers == 0 && !praft.Voter { + splitBrainCount++ + } + } + } + } + return reachableCount > 0 && splitBrainCount == reachableCount + } + return false +} + +func (r *RQLiteManager) isPeerReachable(httpAddr string) bool { + client := &http.Client{Timeout: 3 * time.Second} + resp, err := client.Get(fmt.Sprintf("http://%s/status", httpAddr)) + if err == nil { + resp.Body.Close() + return resp.StatusCode == http.StatusOK + } + return false +} + +func (r *RQLiteManager) getPeerRQLiteStatus(httpAddr string) (*RQLiteStatus, error) { + client := &http.Client{Timeout: 3 * time.Second} + resp, err := 
client.Get(fmt.Sprintf("http://%s/status", httpAddr)) + if err != nil { + return nil, err + } + defer resp.Body.Close() + var status RQLiteStatus + if err := json.NewDecoder(resp.Body).Decode(&status); err != nil { + return nil, err + } + return &status, nil +} + +func (r *RQLiteManager) startHealthMonitoring(ctx context.Context) { + time.Sleep(30 * time.Second) + ticker := time.NewTicker(60 * time.Second) + defer ticker.Stop() + + for { + select { + case <-ctx.Done(): + return + case <-ticker.C: + if r.isInSplitBrainState() { + _ = r.recoverFromSplitBrain(ctx) + } + } + } +} + +// checkNeedsClusterRecovery checks if the node has old cluster state that requires coordinated recovery +func (r *RQLiteManager) checkNeedsClusterRecovery(rqliteDataDir string) (bool, error) { + snapshotsDir := filepath.Join(rqliteDataDir, "rsnapshots") + if _, err := os.Stat(snapshotsDir); os.IsNotExist(err) { + return false, nil + } + + entries, err := os.ReadDir(snapshotsDir) + if err != nil { + return false, err + } + + hasSnapshots := false + for _, entry := range entries { + if entry.IsDir() || strings.HasSuffix(entry.Name(), ".db") { + hasSnapshots = true + break + } + } + + if !hasSnapshots { + return false, nil + } + + raftLogPath := filepath.Join(rqliteDataDir, "raft.db") + if info, err := os.Stat(raftLogPath); err == nil { + if info.Size() <= 8*1024*1024 { + return true, nil + } + } + + return false, nil +} + +func (r *RQLiteManager) hasExistingRaftState(rqliteDataDir string) bool { + raftLogPath := filepath.Join(rqliteDataDir, "raft.db") + if info, err := os.Stat(raftLogPath); err == nil && info.Size() > 1024 { + return true + } + peersPath := filepath.Join(rqliteDataDir, "raft", "peers.json") + _, err := os.Stat(peersPath) + return err == nil +} + +func (r *RQLiteManager) clearRaftState(rqliteDataDir string) error { + _ = os.Remove(filepath.Join(rqliteDataDir, "raft.db")) + _ = os.Remove(filepath.Join(rqliteDataDir, "raft", "peers.json")) + return nil +} + diff --git a/pkg/rqlite/cluster_discovery.go b/pkg/rqlite/cluster_discovery.go index dd357da..72d3da3 100644 --- a/pkg/rqlite/cluster_discovery.go +++ b/pkg/rqlite/cluster_discovery.go @@ -2,20 +2,12 @@ package rqlite import ( "context" - "encoding/json" "fmt" - "net" - "net/netip" - "os" - "path/filepath" - "strings" "sync" "time" "github.com/DeBrosOfficial/network/pkg/discovery" "github.com/libp2p/go-libp2p/core/host" - "github.com/libp2p/go-libp2p/core/peer" - "github.com/multiformats/go-multiaddr" "go.uber.org/zap" ) @@ -160,855 +152,3 @@ func (c *ClusterDiscoveryService) periodicCleanup(ctx context.Context) { } } } - -// collectPeerMetadata collects RQLite metadata from LibP2P peers -func (c *ClusterDiscoveryService) collectPeerMetadata() []*discovery.RQLiteNodeMetadata { - connectedPeers := c.host.Network().Peers() - var metadata []*discovery.RQLiteNodeMetadata - - // Metadata collection is routine - no need to log every occurrence - - c.mu.RLock() - currentRaftAddr := c.raftAddress - currentHTTPAddr := c.httpAddress - c.mu.RUnlock() - - // Add ourselves - ourMetadata := &discovery.RQLiteNodeMetadata{ - NodeID: currentRaftAddr, // RQLite uses raft address as node ID - RaftAddress: currentRaftAddr, - HTTPAddress: currentHTTPAddr, - NodeType: c.nodeType, - RaftLogIndex: c.rqliteManager.getRaftLogIndex(), - LastSeen: time.Now(), - ClusterVersion: "1.0", - } - - if c.adjustSelfAdvertisedAddresses(ourMetadata) { - c.logger.Debug("Adjusted self-advertised RQLite addresses", - zap.String("raft_address", ourMetadata.RaftAddress), - 
zap.String("http_address", ourMetadata.HTTPAddress)) - } - - metadata = append(metadata, ourMetadata) - - staleNodeIDs := make([]string, 0) - - // Query connected peers for their RQLite metadata - // For now, we'll use a simple approach - store metadata in peer metadata store - // In a full implementation, this would use a custom protocol to exchange RQLite metadata - for _, peerID := range connectedPeers { - // Try to get stored metadata from peerstore - // This would be populated by a peer exchange protocol - if val, err := c.host.Peerstore().Get(peerID, "rqlite_metadata"); err == nil { - if jsonData, ok := val.([]byte); ok { - var peerMeta discovery.RQLiteNodeMetadata - if err := json.Unmarshal(jsonData, &peerMeta); err == nil { - if updated, stale := c.adjustPeerAdvertisedAddresses(peerID, &peerMeta); updated && stale != "" { - staleNodeIDs = append(staleNodeIDs, stale) - } - peerMeta.LastSeen = time.Now() - metadata = append(metadata, &peerMeta) - } - } - } - } - - // Clean up stale entries if NodeID changed - if len(staleNodeIDs) > 0 { - c.mu.Lock() - for _, id := range staleNodeIDs { - delete(c.knownPeers, id) - delete(c.peerHealth, id) - } - c.mu.Unlock() - } - - return metadata -} - -// membershipUpdateResult contains the result of a membership update operation -type membershipUpdateResult struct { - peersJSON []map[string]interface{} - added []string - updated []string - changed bool -} - -// updateClusterMembership updates the cluster membership based on discovered peers -func (c *ClusterDiscoveryService) updateClusterMembership() { - metadata := c.collectPeerMetadata() - - // Compute membership changes while holding lock - c.mu.Lock() - result := c.computeMembershipChangesLocked(metadata) - c.mu.Unlock() - - // Perform file I/O outside the lock - if result.changed { - // Log state changes (peer added/removed) at Info level - if len(result.added) > 0 || len(result.updated) > 0 { - c.logger.Info("Membership changed", - zap.Int("added", len(result.added)), - zap.Int("updated", len(result.updated)), - zap.Strings("added", result.added), - zap.Strings("updated", result.updated)) - } - - // Write peers.json without holding lock - if err := c.writePeersJSONWithData(result.peersJSON); err != nil { - c.logger.Error("Failed to write peers.json", - zap.Error(err), - zap.String("data_dir", c.dataDir), - zap.Int("peers", len(result.peersJSON))) - } else { - c.logger.Debug("peers.json updated", - zap.Int("peers", len(result.peersJSON))) - } - - // Update lastUpdate timestamp - c.mu.Lock() - c.lastUpdate = time.Now() - c.mu.Unlock() - } - // No changes - don't log (reduces noise) -} - -// computeMembershipChangesLocked computes membership changes and returns snapshot data -// Must be called with lock held -func (c *ClusterDiscoveryService) computeMembershipChangesLocked(metadata []*discovery.RQLiteNodeMetadata) membershipUpdateResult { - // Track changes - added := []string{} - updated := []string{} - - // Update known peers, but skip self for health tracking - for _, meta := range metadata { - // Skip self-metadata for health tracking (we only track remote peers) - isSelf := meta.NodeID == c.raftAddress - - if existing, ok := c.knownPeers[meta.NodeID]; ok { - // Update existing peer - if existing.RaftLogIndex != meta.RaftLogIndex || - existing.HTTPAddress != meta.HTTPAddress || - existing.RaftAddress != meta.RaftAddress { - updated = append(updated, meta.NodeID) - } - } else { - // New peer discovered - added = append(added, meta.NodeID) - c.logger.Info("Node added", - zap.String("node", 
meta.NodeID), - zap.String("raft", meta.RaftAddress), - zap.String("type", meta.NodeType), - zap.Uint64("log_index", meta.RaftLogIndex)) - } - - c.knownPeers[meta.NodeID] = meta - - // Update health tracking only for remote peers - if !isSelf { - if _, ok := c.peerHealth[meta.NodeID]; !ok { - c.peerHealth[meta.NodeID] = &PeerHealth{ - LastSeen: time.Now(), - LastSuccessful: time.Now(), - Status: "active", - } - } else { - c.peerHealth[meta.NodeID].LastSeen = time.Now() - c.peerHealth[meta.NodeID].Status = "active" - c.peerHealth[meta.NodeID].FailureCount = 0 - } - } - } - - // CRITICAL FIX: Count remote peers (excluding self) - remotePeerCount := 0 - for _, peer := range c.knownPeers { - if peer.NodeID != c.raftAddress { - remotePeerCount++ - } - } - - // Get peers JSON snapshot (for checking if it would be empty) - peers := c.getPeersJSONUnlocked() - - // Determine if we should write peers.json - shouldWrite := len(added) > 0 || len(updated) > 0 || c.lastUpdate.IsZero() - - // CRITICAL FIX: Don't write peers.json until we have minimum cluster size - // This prevents RQLite from starting as a single-node cluster - // For min_cluster_size=3, we need at least 2 remote peers (plus self = 3 total) - if shouldWrite { - // For initial sync, wait until we have at least (MinClusterSize - 1) remote peers - // This ensures peers.json contains enough peers for proper cluster formation - if c.lastUpdate.IsZero() { - requiredRemotePeers := c.minClusterSize - 1 - - if remotePeerCount < requiredRemotePeers { - c.logger.Info("Waiting for peers", - zap.Int("have", remotePeerCount), - zap.Int("need", requiredRemotePeers), - zap.Int("min_size", c.minClusterSize)) - return membershipUpdateResult{ - changed: false, - } - } - } - - // Additional safety check: don't write empty peers.json (would cause single-node cluster) - if len(peers) == 0 && c.lastUpdate.IsZero() { - c.logger.Info("No remote peers - waiting") - return membershipUpdateResult{ - changed: false, - } - } - - // Log initial sync if this is the first time - if c.lastUpdate.IsZero() { - c.logger.Info("Initial sync", - zap.Int("total", len(c.knownPeers)), - zap.Int("remote", remotePeerCount), - zap.Int("in_json", len(peers))) - } - - return membershipUpdateResult{ - peersJSON: peers, - added: added, - updated: updated, - changed: true, - } - } - - return membershipUpdateResult{ - changed: false, - } -} - -// removeInactivePeers removes peers that haven't been seen for longer than the inactivity limit -func (c *ClusterDiscoveryService) removeInactivePeers() { - c.mu.Lock() - defer c.mu.Unlock() - - now := time.Now() - removed := []string{} - - for nodeID, health := range c.peerHealth { - inactiveDuration := now.Sub(health.LastSeen) - - if inactiveDuration > c.inactivityLimit { - // Mark as inactive and remove - c.logger.Warn("Node removed", - zap.String("node", nodeID), - zap.String("reason", "inactive"), - zap.Duration("inactive_duration", inactiveDuration)) - - delete(c.knownPeers, nodeID) - delete(c.peerHealth, nodeID) - removed = append(removed, nodeID) - } - } - - // Regenerate peers.json if any peers were removed - if len(removed) > 0 { - c.logger.Info("Removed inactive", - zap.Int("count", len(removed)), - zap.Strings("nodes", removed)) - - if err := c.writePeersJSON(); err != nil { - c.logger.Error("Failed to write peers.json after cleanup", zap.Error(err)) - } - } -} - -// getPeersJSON generates the peers.json structure from active peers (acquires lock) -func (c *ClusterDiscoveryService) getPeersJSON() []map[string]interface{} { - 
c.mu.RLock() - defer c.mu.RUnlock() - return c.getPeersJSONUnlocked() -} - -// getPeersJSONUnlocked generates the peers.json structure (must be called with lock held) -func (c *ClusterDiscoveryService) getPeersJSONUnlocked() []map[string]interface{} { - peers := make([]map[string]interface{}, 0, len(c.knownPeers)) - - for _, peer := range c.knownPeers { - // CRITICAL FIX: Include ALL peers (including self) in peers.json - // When using expect configuration with recovery, RQLite needs the complete - // expected cluster configuration to properly form consensus. - // The peers.json file is used by RQLite's recovery mechanism to know - // what the full cluster membership should be, including the local node. - peerEntry := map[string]interface{}{ - "id": peer.RaftAddress, // RQLite uses raft address as node ID - "address": peer.RaftAddress, - "non_voter": false, - } - peers = append(peers, peerEntry) - } - - return peers -} - -// writePeersJSON atomically writes the peers.json file (acquires lock) -func (c *ClusterDiscoveryService) writePeersJSON() error { - c.mu.RLock() - peers := c.getPeersJSONUnlocked() - c.mu.RUnlock() - - return c.writePeersJSONWithData(peers) -} - -// writePeersJSONWithData writes the peers.json file with provided data (no lock needed) -func (c *ClusterDiscoveryService) writePeersJSONWithData(peers []map[string]interface{}) error { - // Expand ~ in data directory path - dataDir := os.ExpandEnv(c.dataDir) - if strings.HasPrefix(dataDir, "~") { - home, err := os.UserHomeDir() - if err != nil { - return fmt.Errorf("failed to determine home directory: %w", err) - } - dataDir = filepath.Join(home, dataDir[1:]) - } - - // Get the RQLite raft directory - rqliteDir := filepath.Join(dataDir, "rqlite", "raft") - - // Writing peers.json - routine operation, no need to log details - - if err := os.MkdirAll(rqliteDir, 0755); err != nil { - return fmt.Errorf("failed to create raft directory %s: %w", rqliteDir, err) - } - - peersFile := filepath.Join(rqliteDir, "peers.json") - backupFile := filepath.Join(rqliteDir, "peers.json.backup") - - // Backup existing peers.json if it exists - if _, err := os.Stat(peersFile); err == nil { - // Backup existing peers.json if it exists - routine operation - data, err := os.ReadFile(peersFile) - if err == nil { - _ = os.WriteFile(backupFile, data, 0644) - } - } - - // Marshal to JSON - data, err := json.MarshalIndent(peers, "", " ") - if err != nil { - return fmt.Errorf("failed to marshal peers.json: %w", err) - } - - // Marshaled peers.json - routine operation - - // Write atomically using temp file + rename - tempFile := peersFile + ".tmp" - if err := os.WriteFile(tempFile, data, 0644); err != nil { - return fmt.Errorf("failed to write temp peers.json %s: %w", tempFile, err) - } - - if err := os.Rename(tempFile, peersFile); err != nil { - return fmt.Errorf("failed to rename %s to %s: %w", tempFile, peersFile, err) - } - - nodeIDs := make([]string, 0, len(peers)) - for _, p := range peers { - if id, ok := p["id"].(string); ok { - nodeIDs = append(nodeIDs, id) - } - } - - c.logger.Info("peers.json written", - zap.Int("peers", len(peers)), - zap.Strings("nodes", nodeIDs)) - - return nil -} - -// GetActivePeers returns a list of active peers (not including self) -func (c *ClusterDiscoveryService) GetActivePeers() []*discovery.RQLiteNodeMetadata { - c.mu.RLock() - defer c.mu.RUnlock() - - peers := make([]*discovery.RQLiteNodeMetadata, 0, len(c.knownPeers)) - for _, peer := range c.knownPeers { - // Skip self (compare by raft address since that's the 
NodeID now) - if peer.NodeID == c.raftAddress { - continue - } - peers = append(peers, peer) - } - - return peers -} - -// GetAllPeers returns a list of all known peers (including self) -func (c *ClusterDiscoveryService) GetAllPeers() []*discovery.RQLiteNodeMetadata { - c.mu.RLock() - defer c.mu.RUnlock() - - peers := make([]*discovery.RQLiteNodeMetadata, 0, len(c.knownPeers)) - for _, peer := range c.knownPeers { - peers = append(peers, peer) - } - - return peers -} - -// GetNodeWithHighestLogIndex returns the node with the highest Raft log index -func (c *ClusterDiscoveryService) GetNodeWithHighestLogIndex() *discovery.RQLiteNodeMetadata { - c.mu.RLock() - defer c.mu.RUnlock() - - var highest *discovery.RQLiteNodeMetadata - var maxIndex uint64 = 0 - - for _, peer := range c.knownPeers { - // Skip self (compare by raft address since that's the NodeID now) - if peer.NodeID == c.raftAddress { - continue - } - - if peer.RaftLogIndex > maxIndex { - maxIndex = peer.RaftLogIndex - highest = peer - } - } - - return highest -} - -// HasRecentPeersJSON checks if peers.json was recently updated -func (c *ClusterDiscoveryService) HasRecentPeersJSON() bool { - c.mu.RLock() - defer c.mu.RUnlock() - - // Consider recent if updated in last 5 minutes - return time.Since(c.lastUpdate) < 5*time.Minute -} - -// FindJoinTargets discovers join targets via LibP2P -func (c *ClusterDiscoveryService) FindJoinTargets() []string { - c.mu.RLock() - defer c.mu.RUnlock() - - targets := []string{} - - // All nodes are equal - prioritize by Raft log index (more advanced = better) - type nodeWithIndex struct { - address string - logIndex uint64 - } - var nodes []nodeWithIndex - for _, peer := range c.knownPeers { - nodes = append(nodes, nodeWithIndex{peer.RaftAddress, peer.RaftLogIndex}) - } - - // Sort by log index descending (higher log index = more up-to-date) - for i := 0; i < len(nodes)-1; i++ { - for j := i + 1; j < len(nodes); j++ { - if nodes[j].logIndex > nodes[i].logIndex { - nodes[i], nodes[j] = nodes[j], nodes[i] - } - } - } - - for _, n := range nodes { - targets = append(targets, n.address) - } - - return targets -} - -// WaitForDiscoverySettling waits for LibP2P discovery to settle (used on concurrent startup) -func (c *ClusterDiscoveryService) WaitForDiscoverySettling(ctx context.Context) { - settleDuration := 60 * time.Second - c.logger.Info("Waiting for discovery to settle", - zap.Duration("duration", settleDuration)) - - select { - case <-ctx.Done(): - return - case <-time.After(settleDuration): - } - - // Collect final peer list - c.updateClusterMembership() - - c.mu.RLock() - peerCount := len(c.knownPeers) - c.mu.RUnlock() - - c.logger.Info("Discovery settled", - zap.Int("peer_count", peerCount)) -} - -// TriggerSync manually triggers a cluster membership sync -func (c *ClusterDiscoveryService) TriggerSync() { - // All nodes use the same discovery timing for consistency - c.updateClusterMembership() -} - -// ForceWritePeersJSON forces writing peers.json regardless of membership changes -// This is useful after clearing raft state when we need to recreate peers.json -func (c *ClusterDiscoveryService) ForceWritePeersJSON() error { - c.logger.Info("Force writing peers.json") - - // First, collect latest peer metadata to ensure we have current information - metadata := c.collectPeerMetadata() - - // Update known peers with latest metadata (without writing file yet) - c.mu.Lock() - for _, meta := range metadata { - c.knownPeers[meta.NodeID] = meta - // Update health tracking for remote peers - if 
meta.NodeID != c.raftAddress { - if _, ok := c.peerHealth[meta.NodeID]; !ok { - c.peerHealth[meta.NodeID] = &PeerHealth{ - LastSeen: time.Now(), - LastSuccessful: time.Now(), - Status: "active", - } - } else { - c.peerHealth[meta.NodeID].LastSeen = time.Now() - c.peerHealth[meta.NodeID].Status = "active" - } - } - } - peers := c.getPeersJSONUnlocked() - c.mu.Unlock() - - // Now force write the file - if err := c.writePeersJSONWithData(peers); err != nil { - c.logger.Error("Failed to force write peers.json", - zap.Error(err), - zap.String("data_dir", c.dataDir), - zap.Int("peers", len(peers))) - return err - } - - c.logger.Info("peers.json written", - zap.Int("peers", len(peers))) - - return nil -} - -// TriggerPeerExchange actively exchanges peer information with connected peers -// This populates the peerstore with RQLite metadata from other nodes -func (c *ClusterDiscoveryService) TriggerPeerExchange(ctx context.Context) error { - if c.discoveryMgr == nil { - return fmt.Errorf("discovery manager not available") - } - - collected := c.discoveryMgr.TriggerPeerExchange(ctx) - c.logger.Debug("Exchange completed", zap.Int("with_metadata", collected)) - - return nil -} - -// UpdateOwnMetadata updates our own RQLite metadata in the peerstore -func (c *ClusterDiscoveryService) UpdateOwnMetadata() { - c.mu.RLock() - currentRaftAddr := c.raftAddress - currentHTTPAddr := c.httpAddress - c.mu.RUnlock() - - metadata := &discovery.RQLiteNodeMetadata{ - NodeID: currentRaftAddr, // RQLite uses raft address as node ID - RaftAddress: currentRaftAddr, - HTTPAddress: currentHTTPAddr, - NodeType: c.nodeType, - RaftLogIndex: c.rqliteManager.getRaftLogIndex(), - LastSeen: time.Now(), - ClusterVersion: "1.0", - } - - // Adjust addresses if needed - if c.adjustSelfAdvertisedAddresses(metadata) { - c.logger.Debug("Adjusted self-advertised RQLite addresses in UpdateOwnMetadata", - zap.String("raft_address", metadata.RaftAddress), - zap.String("http_address", metadata.HTTPAddress)) - } - - // Store in our own peerstore for peer exchange - data, err := json.Marshal(metadata) - if err != nil { - c.logger.Error("Failed to marshal own metadata", zap.Error(err)) - return - } - - if err := c.host.Peerstore().Put(c.host.ID(), "rqlite_metadata", data); err != nil { - c.logger.Error("Failed to store own metadata", zap.Error(err)) - return - } - - c.logger.Debug("Metadata updated", - zap.String("node", metadata.NodeID), - zap.Uint64("log_index", metadata.RaftLogIndex)) -} - -// StoreRemotePeerMetadata stores metadata received from a remote peer -func (c *ClusterDiscoveryService) StoreRemotePeerMetadata(peerID peer.ID, metadata *discovery.RQLiteNodeMetadata) error { - if metadata == nil { - return fmt.Errorf("metadata is nil") - } - - // Adjust addresses if needed (replace localhost with actual IP) - if updated, stale := c.adjustPeerAdvertisedAddresses(peerID, metadata); updated && stale != "" { - // Clean up stale entry if NodeID changed - c.mu.Lock() - delete(c.knownPeers, stale) - delete(c.peerHealth, stale) - c.mu.Unlock() - } - - metadata.LastSeen = time.Now() - - data, err := json.Marshal(metadata) - if err != nil { - return fmt.Errorf("failed to marshal metadata: %w", err) - } - - if err := c.host.Peerstore().Put(peerID, "rqlite_metadata", data); err != nil { - return fmt.Errorf("failed to store metadata: %w", err) - } - - c.logger.Debug("Metadata stored", - zap.String("peer", shortPeerID(peerID)), - zap.String("node", metadata.NodeID)) - - return nil -} - -// adjustPeerAdvertisedAddresses adjusts peer metadata 
addresses by replacing localhost/loopback -// with the actual IP address from LibP2P connection. Returns (updated, staleNodeID). -// staleNodeID is non-empty if NodeID changed (indicating old entry should be cleaned up). -func (c *ClusterDiscoveryService) adjustPeerAdvertisedAddresses(peerID peer.ID, meta *discovery.RQLiteNodeMetadata) (bool, string) { - ip := c.selectPeerIP(peerID) - if ip == "" { - return false, "" - } - - changed, stale := rewriteAdvertisedAddresses(meta, ip, true) - if changed { - c.logger.Debug("Addresses normalized", - zap.String("peer", shortPeerID(peerID)), - zap.String("raft", meta.RaftAddress), - zap.String("http_address", meta.HTTPAddress)) - } - return changed, stale -} - -// adjustSelfAdvertisedAddresses adjusts our own metadata addresses by replacing localhost/loopback -// with the actual IP address from LibP2P host. Updates internal state if changed. -func (c *ClusterDiscoveryService) adjustSelfAdvertisedAddresses(meta *discovery.RQLiteNodeMetadata) bool { - ip := c.selectSelfIP() - if ip == "" { - return false - } - - changed, _ := rewriteAdvertisedAddresses(meta, ip, true) - if !changed { - return false - } - - // Update internal state with corrected addresses - c.mu.Lock() - c.raftAddress = meta.RaftAddress - c.httpAddress = meta.HTTPAddress - c.mu.Unlock() - - if c.rqliteManager != nil { - c.rqliteManager.UpdateAdvertisedAddresses(meta.RaftAddress, meta.HTTPAddress) - } - - return true -} - -// selectPeerIP selects the best IP address for a peer from LibP2P connections. -// Prefers public IPs, falls back to private IPs if no public IP is available. -func (c *ClusterDiscoveryService) selectPeerIP(peerID peer.ID) string { - var fallback string - - // First, try to get IP from active connections - for _, conn := range c.host.Network().ConnsToPeer(peerID) { - if ip, public := ipFromMultiaddr(conn.RemoteMultiaddr()); ip != "" { - if shouldReplaceHost(ip) { - continue - } - if public { - return ip - } - if fallback == "" { - fallback = ip - } - } - } - - // Fallback to peerstore addresses - for _, addr := range c.host.Peerstore().Addrs(peerID) { - if ip, public := ipFromMultiaddr(addr); ip != "" { - if shouldReplaceHost(ip) { - continue - } - if public { - return ip - } - if fallback == "" { - fallback = ip - } - } - } - - return fallback -} - -// selectSelfIP selects the best IP address for ourselves from LibP2P host addresses. -// Prefers public IPs, falls back to private IPs if no public IP is available. -func (c *ClusterDiscoveryService) selectSelfIP() string { - var fallback string - - for _, addr := range c.host.Addrs() { - if ip, public := ipFromMultiaddr(addr); ip != "" { - if shouldReplaceHost(ip) { - continue - } - if public { - return ip - } - if fallback == "" { - fallback = ip - } - } - } - - return fallback -} - -// rewriteAdvertisedAddresses rewrites RaftAddress and HTTPAddress in metadata, -// replacing localhost/loopback addresses with the provided IP. -// Returns (changed, staleNodeID). staleNodeID is non-empty if NodeID changed. 
-func rewriteAdvertisedAddresses(meta *discovery.RQLiteNodeMetadata, newHost string, allowNodeIDRewrite bool) (bool, string) { - if meta == nil || newHost == "" { - return false, "" - } - - originalNodeID := meta.NodeID - changed := false - nodeIDChanged := false - - // Replace host in RaftAddress if it's localhost/loopback - if newAddr, replaced := replaceAddressHost(meta.RaftAddress, newHost); replaced { - if meta.RaftAddress != newAddr { - meta.RaftAddress = newAddr - changed = true - } - } - - // Replace host in HTTPAddress if it's localhost/loopback - if newAddr, replaced := replaceAddressHost(meta.HTTPAddress, newHost); replaced { - if meta.HTTPAddress != newAddr { - meta.HTTPAddress = newAddr - changed = true - } - } - - // Update NodeID to match RaftAddress if it changed - if allowNodeIDRewrite { - if meta.RaftAddress != "" && (meta.NodeID == "" || meta.NodeID == originalNodeID || shouldReplaceHost(hostFromAddress(meta.NodeID))) { - if meta.NodeID != meta.RaftAddress { - meta.NodeID = meta.RaftAddress - nodeIDChanged = meta.NodeID != originalNodeID - if nodeIDChanged { - changed = true - } - } - } - } - - if nodeIDChanged { - return changed, originalNodeID - } - return changed, "" -} - -// replaceAddressHost replaces the host part of an address if it's localhost/loopback. -// Returns (newAddress, replaced). replaced is true if host was replaced. -func replaceAddressHost(address, newHost string) (string, bool) { - if address == "" || newHost == "" { - return address, false - } - - host, port, err := net.SplitHostPort(address) - if err != nil { - return address, false - } - - if !shouldReplaceHost(host) { - return address, false - } - - return net.JoinHostPort(newHost, port), true -} - -// shouldReplaceHost returns true if the host should be replaced (localhost, loopback, etc.) -func shouldReplaceHost(host string) bool { - if host == "" { - return true - } - if strings.EqualFold(host, "localhost") { - return true - } - - // Check if it's a loopback or unspecified address - if addr, err := netip.ParseAddr(host); err == nil { - if addr.IsLoopback() || addr.IsUnspecified() { - return true - } - } - - return false -} - -// hostFromAddress extracts the host part from a host:port address -func hostFromAddress(address string) string { - host, _, err := net.SplitHostPort(address) - if err != nil { - return "" - } - return host -} - -// ipFromMultiaddr extracts an IP address from a multiaddr and returns (ip, isPublic) -func ipFromMultiaddr(addr multiaddr.Multiaddr) (string, bool) { - if addr == nil { - return "", false - } - - if v4, err := addr.ValueForProtocol(multiaddr.P_IP4); err == nil { - return v4, isPublicIP(v4) - } - if v6, err := addr.ValueForProtocol(multiaddr.P_IP6); err == nil { - return v6, isPublicIP(v6) - } - return "", false -} - -// isPublicIP returns true if the IP is a public (non-private, non-loopback) address -func isPublicIP(ip string) bool { - addr, err := netip.ParseAddr(ip) - if err != nil { - return false - } - // Exclude loopback, unspecified, link-local, multicast, and private addresses - if addr.IsLoopback() || addr.IsUnspecified() || addr.IsLinkLocalUnicast() || addr.IsLinkLocalMulticast() || addr.IsPrivate() { - return false - } - return true -} - -// shortPeerID returns a shortened version of a peer ID for logging -func shortPeerID(id peer.ID) string { - s := id.String() - if len(s) <= 8 { - return s - } - return s[:8] + "..." 
-} diff --git a/pkg/rqlite/cluster_discovery_membership.go b/pkg/rqlite/cluster_discovery_membership.go new file mode 100644 index 0000000..55065f3 --- /dev/null +++ b/pkg/rqlite/cluster_discovery_membership.go @@ -0,0 +1,318 @@ +package rqlite + +import ( + "encoding/json" + "fmt" + "os" + "path/filepath" + "strings" + "time" + + "github.com/DeBrosOfficial/network/pkg/discovery" + "go.uber.org/zap" +) + +// collectPeerMetadata collects RQLite metadata from LibP2P peers +func (c *ClusterDiscoveryService) collectPeerMetadata() []*discovery.RQLiteNodeMetadata { + connectedPeers := c.host.Network().Peers() + var metadata []*discovery.RQLiteNodeMetadata + + c.mu.RLock() + currentRaftAddr := c.raftAddress + currentHTTPAddr := c.httpAddress + c.mu.RUnlock() + + // Add ourselves + ourMetadata := &discovery.RQLiteNodeMetadata{ + NodeID: currentRaftAddr, // RQLite uses raft address as node ID + RaftAddress: currentRaftAddr, + HTTPAddress: currentHTTPAddr, + NodeType: c.nodeType, + RaftLogIndex: c.rqliteManager.getRaftLogIndex(), + LastSeen: time.Now(), + ClusterVersion: "1.0", + } + + if c.adjustSelfAdvertisedAddresses(ourMetadata) { + c.logger.Debug("Adjusted self-advertised RQLite addresses", + zap.String("raft_address", ourMetadata.RaftAddress), + zap.String("http_address", ourMetadata.HTTPAddress)) + } + + metadata = append(metadata, ourMetadata) + + staleNodeIDs := make([]string, 0) + + for _, peerID := range connectedPeers { + if val, err := c.host.Peerstore().Get(peerID, "rqlite_metadata"); err == nil { + if jsonData, ok := val.([]byte); ok { + var peerMeta discovery.RQLiteNodeMetadata + if err := json.Unmarshal(jsonData, &peerMeta); err == nil { + if updated, stale := c.adjustPeerAdvertisedAddresses(peerID, &peerMeta); updated && stale != "" { + staleNodeIDs = append(staleNodeIDs, stale) + } + peerMeta.LastSeen = time.Now() + metadata = append(metadata, &peerMeta) + } + } + } + } + + if len(staleNodeIDs) > 0 { + c.mu.Lock() + for _, id := range staleNodeIDs { + delete(c.knownPeers, id) + delete(c.peerHealth, id) + } + c.mu.Unlock() + } + + return metadata +} + +type membershipUpdateResult struct { + peersJSON []map[string]interface{} + added []string + updated []string + changed bool +} + +func (c *ClusterDiscoveryService) updateClusterMembership() { + metadata := c.collectPeerMetadata() + + c.mu.Lock() + result := c.computeMembershipChangesLocked(metadata) + c.mu.Unlock() + + if result.changed { + if len(result.added) > 0 || len(result.updated) > 0 { + c.logger.Info("Membership changed", + zap.Int("added", len(result.added)), + zap.Int("updated", len(result.updated)), + zap.Strings("added", result.added), + zap.Strings("updated", result.updated)) + } + + if err := c.writePeersJSONWithData(result.peersJSON); err != nil { + c.logger.Error("Failed to write peers.json", + zap.Error(err), + zap.String("data_dir", c.dataDir), + zap.Int("peers", len(result.peersJSON))) + } else { + c.logger.Debug("peers.json updated", + zap.Int("peers", len(result.peersJSON))) + } + + c.mu.Lock() + c.lastUpdate = time.Now() + c.mu.Unlock() + } +} + +func (c *ClusterDiscoveryService) computeMembershipChangesLocked(metadata []*discovery.RQLiteNodeMetadata) membershipUpdateResult { + added := []string{} + updated := []string{} + + for _, meta := range metadata { + isSelf := meta.NodeID == c.raftAddress + + if existing, ok := c.knownPeers[meta.NodeID]; ok { + if existing.RaftLogIndex != meta.RaftLogIndex || + existing.HTTPAddress != meta.HTTPAddress || + existing.RaftAddress != meta.RaftAddress { + updated = 
append(updated, meta.NodeID) + } + } else { + added = append(added, meta.NodeID) + c.logger.Info("Node added", + zap.String("node", meta.NodeID), + zap.String("raft", meta.RaftAddress), + zap.String("type", meta.NodeType), + zap.Uint64("log_index", meta.RaftLogIndex)) + } + + c.knownPeers[meta.NodeID] = meta + + if !isSelf { + if _, ok := c.peerHealth[meta.NodeID]; !ok { + c.peerHealth[meta.NodeID] = &PeerHealth{ + LastSeen: time.Now(), + LastSuccessful: time.Now(), + Status: "active", + } + } else { + c.peerHealth[meta.NodeID].LastSeen = time.Now() + c.peerHealth[meta.NodeID].Status = "active" + c.peerHealth[meta.NodeID].FailureCount = 0 + } + } + } + + remotePeerCount := 0 + for _, peer := range c.knownPeers { + if peer.NodeID != c.raftAddress { + remotePeerCount++ + } + } + + peers := c.getPeersJSONUnlocked() + shouldWrite := len(added) > 0 || len(updated) > 0 || c.lastUpdate.IsZero() + + if shouldWrite { + if c.lastUpdate.IsZero() { + requiredRemotePeers := c.minClusterSize - 1 + + if remotePeerCount < requiredRemotePeers { + c.logger.Info("Waiting for peers", + zap.Int("have", remotePeerCount), + zap.Int("need", requiredRemotePeers), + zap.Int("min_size", c.minClusterSize)) + return membershipUpdateResult{ + changed: false, + } + } + } + + if len(peers) == 0 && c.lastUpdate.IsZero() { + c.logger.Info("No remote peers - waiting") + return membershipUpdateResult{ + changed: false, + } + } + + if c.lastUpdate.IsZero() { + c.logger.Info("Initial sync", + zap.Int("total", len(c.knownPeers)), + zap.Int("remote", remotePeerCount), + zap.Int("in_json", len(peers))) + } + + return membershipUpdateResult{ + peersJSON: peers, + added: added, + updated: updated, + changed: true, + } + } + + return membershipUpdateResult{ + changed: false, + } +} + +func (c *ClusterDiscoveryService) removeInactivePeers() { + c.mu.Lock() + defer c.mu.Unlock() + + now := time.Now() + removed := []string{} + + for nodeID, health := range c.peerHealth { + inactiveDuration := now.Sub(health.LastSeen) + + if inactiveDuration > c.inactivityLimit { + c.logger.Warn("Node removed", + zap.String("node", nodeID), + zap.String("reason", "inactive"), + zap.Duration("inactive_duration", inactiveDuration)) + + delete(c.knownPeers, nodeID) + delete(c.peerHealth, nodeID) + removed = append(removed, nodeID) + } + } + + if len(removed) > 0 { + c.logger.Info("Removed inactive", + zap.Int("count", len(removed)), + zap.Strings("nodes", removed)) + + if err := c.writePeersJSON(); err != nil { + c.logger.Error("Failed to write peers.json after cleanup", zap.Error(err)) + } + } +} + +func (c *ClusterDiscoveryService) getPeersJSON() []map[string]interface{} { + c.mu.RLock() + defer c.mu.RUnlock() + return c.getPeersJSONUnlocked() +} + +func (c *ClusterDiscoveryService) getPeersJSONUnlocked() []map[string]interface{} { + peers := make([]map[string]interface{}, 0, len(c.knownPeers)) + + for _, peer := range c.knownPeers { + peerEntry := map[string]interface{}{ + "id": peer.RaftAddress, + "address": peer.RaftAddress, + "non_voter": false, + } + peers = append(peers, peerEntry) + } + + return peers +} + +func (c *ClusterDiscoveryService) writePeersJSON() error { + c.mu.RLock() + peers := c.getPeersJSONUnlocked() + c.mu.RUnlock() + + return c.writePeersJSONWithData(peers) +} + +func (c *ClusterDiscoveryService) writePeersJSONWithData(peers []map[string]interface{}) error { + dataDir := os.ExpandEnv(c.dataDir) + if strings.HasPrefix(dataDir, "~") { + home, err := os.UserHomeDir() + if err != nil { + return fmt.Errorf("failed to determine home 
directory: %w", err) + } + dataDir = filepath.Join(home, dataDir[1:]) + } + + rqliteDir := filepath.Join(dataDir, "rqlite", "raft") + + if err := os.MkdirAll(rqliteDir, 0755); err != nil { + return fmt.Errorf("failed to create raft directory %s: %w", rqliteDir, err) + } + + peersFile := filepath.Join(rqliteDir, "peers.json") + backupFile := filepath.Join(rqliteDir, "peers.json.backup") + + if _, err := os.Stat(peersFile); err == nil { + data, err := os.ReadFile(peersFile) + if err == nil { + _ = os.WriteFile(backupFile, data, 0644) + } + } + + data, err := json.MarshalIndent(peers, "", " ") + if err != nil { + return fmt.Errorf("failed to marshal peers.json: %w", err) + } + + tempFile := peersFile + ".tmp" + if err := os.WriteFile(tempFile, data, 0644); err != nil { + return fmt.Errorf("failed to write temp peers.json %s: %w", tempFile, err) + } + + if err := os.Rename(tempFile, peersFile); err != nil { + return fmt.Errorf("failed to rename %s to %s: %w", tempFile, peersFile, err) + } + + nodeIDs := make([]string, 0, len(peers)) + for _, p := range peers { + if id, ok := p["id"].(string); ok { + nodeIDs = append(nodeIDs, id) + } + } + + c.logger.Info("peers.json written", + zap.Int("peers", len(peers)), + zap.Strings("nodes", nodeIDs)) + + return nil +} + diff --git a/pkg/rqlite/cluster_discovery_queries.go b/pkg/rqlite/cluster_discovery_queries.go new file mode 100644 index 0000000..3d0960f --- /dev/null +++ b/pkg/rqlite/cluster_discovery_queries.go @@ -0,0 +1,251 @@ +package rqlite + +import ( + "context" + "encoding/json" + "fmt" + "time" + + "github.com/DeBrosOfficial/network/pkg/discovery" + "github.com/libp2p/go-libp2p/core/peer" + "go.uber.org/zap" +) + +// GetActivePeers returns a list of active peers (not including self) +func (c *ClusterDiscoveryService) GetActivePeers() []*discovery.RQLiteNodeMetadata { + c.mu.RLock() + defer c.mu.RUnlock() + + peers := make([]*discovery.RQLiteNodeMetadata, 0, len(c.knownPeers)) + for _, peer := range c.knownPeers { + if peer.NodeID == c.raftAddress { + continue + } + peers = append(peers, peer) + } + + return peers +} + +// GetAllPeers returns a list of all known peers (including self) +func (c *ClusterDiscoveryService) GetAllPeers() []*discovery.RQLiteNodeMetadata { + c.mu.RLock() + defer c.mu.RUnlock() + + peers := make([]*discovery.RQLiteNodeMetadata, 0, len(c.knownPeers)) + for _, peer := range c.knownPeers { + peers = append(peers, peer) + } + + return peers +} + +// GetNodeWithHighestLogIndex returns the node with the highest Raft log index +func (c *ClusterDiscoveryService) GetNodeWithHighestLogIndex() *discovery.RQLiteNodeMetadata { + c.mu.RLock() + defer c.mu.RUnlock() + + var highest *discovery.RQLiteNodeMetadata + var maxIndex uint64 = 0 + + for _, peer := range c.knownPeers { + if peer.NodeID == c.raftAddress { + continue + } + + if peer.RaftLogIndex > maxIndex { + maxIndex = peer.RaftLogIndex + highest = peer + } + } + + return highest +} + +// HasRecentPeersJSON checks if peers.json was recently updated +func (c *ClusterDiscoveryService) HasRecentPeersJSON() bool { + c.mu.RLock() + defer c.mu.RUnlock() + + return time.Since(c.lastUpdate) < 5*time.Minute +} + +// FindJoinTargets discovers join targets via LibP2P +func (c *ClusterDiscoveryService) FindJoinTargets() []string { + c.mu.RLock() + defer c.mu.RUnlock() + + targets := []string{} + + type nodeWithIndex struct { + address string + logIndex uint64 + } + var nodes []nodeWithIndex + for _, peer := range c.knownPeers { + nodes = append(nodes, nodeWithIndex{peer.RaftAddress, 
peer.RaftLogIndex}) + } + + for i := 0; i < len(nodes)-1; i++ { + for j := i + 1; j < len(nodes); j++ { + if nodes[j].logIndex > nodes[i].logIndex { + nodes[i], nodes[j] = nodes[j], nodes[i] + } + } + } + + for _, n := range nodes { + targets = append(targets, n.address) + } + + return targets +} + +// WaitForDiscoverySettling waits for LibP2P discovery to settle (used on concurrent startup) +func (c *ClusterDiscoveryService) WaitForDiscoverySettling(ctx context.Context) { + settleDuration := 60 * time.Second + c.logger.Info("Waiting for discovery to settle", + zap.Duration("duration", settleDuration)) + + select { + case <-ctx.Done(): + return + case <-time.After(settleDuration): + } + + c.updateClusterMembership() + + c.mu.RLock() + peerCount := len(c.knownPeers) + c.mu.RUnlock() + + c.logger.Info("Discovery settled", + zap.Int("peer_count", peerCount)) +} + +// TriggerSync manually triggers a cluster membership sync +func (c *ClusterDiscoveryService) TriggerSync() { + c.updateClusterMembership() +} + +// ForceWritePeersJSON forces writing peers.json regardless of membership changes +func (c *ClusterDiscoveryService) ForceWritePeersJSON() error { + c.logger.Info("Force writing peers.json") + + metadata := c.collectPeerMetadata() + + c.mu.Lock() + for _, meta := range metadata { + c.knownPeers[meta.NodeID] = meta + if meta.NodeID != c.raftAddress { + if _, ok := c.peerHealth[meta.NodeID]; !ok { + c.peerHealth[meta.NodeID] = &PeerHealth{ + LastSeen: time.Now(), + LastSuccessful: time.Now(), + Status: "active", + } + } else { + c.peerHealth[meta.NodeID].LastSeen = time.Now() + c.peerHealth[meta.NodeID].Status = "active" + } + } + } + peers := c.getPeersJSONUnlocked() + c.mu.Unlock() + + if err := c.writePeersJSONWithData(peers); err != nil { + c.logger.Error("Failed to force write peers.json", + zap.Error(err), + zap.String("data_dir", c.dataDir), + zap.Int("peers", len(peers))) + return err + } + + c.logger.Info("peers.json written", + zap.Int("peers", len(peers))) + + return nil +} + +// TriggerPeerExchange actively exchanges peer information with connected peers +func (c *ClusterDiscoveryService) TriggerPeerExchange(ctx context.Context) error { + if c.discoveryMgr == nil { + return fmt.Errorf("discovery manager not available") + } + + collected := c.discoveryMgr.TriggerPeerExchange(ctx) + c.logger.Debug("Exchange completed", zap.Int("with_metadata", collected)) + + return nil +} + +// UpdateOwnMetadata updates our own RQLite metadata in the peerstore +func (c *ClusterDiscoveryService) UpdateOwnMetadata() { + c.mu.RLock() + currentRaftAddr := c.raftAddress + currentHTTPAddr := c.httpAddress + c.mu.RUnlock() + + metadata := &discovery.RQLiteNodeMetadata{ + NodeID: currentRaftAddr, + RaftAddress: currentRaftAddr, + HTTPAddress: currentHTTPAddr, + NodeType: c.nodeType, + RaftLogIndex: c.rqliteManager.getRaftLogIndex(), + LastSeen: time.Now(), + ClusterVersion: "1.0", + } + + if c.adjustSelfAdvertisedAddresses(metadata) { + c.logger.Debug("Adjusted self-advertised RQLite addresses in UpdateOwnMetadata", + zap.String("raft_address", metadata.RaftAddress), + zap.String("http_address", metadata.HTTPAddress)) + } + + data, err := json.Marshal(metadata) + if err != nil { + c.logger.Error("Failed to marshal own metadata", zap.Error(err)) + return + } + + if err := c.host.Peerstore().Put(c.host.ID(), "rqlite_metadata", data); err != nil { + c.logger.Error("Failed to store own metadata", zap.Error(err)) + return + } + + c.logger.Debug("Metadata updated", + zap.String("node", metadata.NodeID), + 
zap.Uint64("log_index", metadata.RaftLogIndex)) +} + +// StoreRemotePeerMetadata stores metadata received from a remote peer +func (c *ClusterDiscoveryService) StoreRemotePeerMetadata(peerID peer.ID, metadata *discovery.RQLiteNodeMetadata) error { + if metadata == nil { + return fmt.Errorf("metadata is nil") + } + + if updated, stale := c.adjustPeerAdvertisedAddresses(peerID, metadata); updated && stale != "" { + c.mu.Lock() + delete(c.knownPeers, stale) + delete(c.peerHealth, stale) + c.mu.Unlock() + } + + metadata.LastSeen = time.Now() + + data, err := json.Marshal(metadata) + if err != nil { + return fmt.Errorf("failed to marshal metadata: %w", err) + } + + if err := c.host.Peerstore().Put(peerID, "rqlite_metadata", data); err != nil { + return fmt.Errorf("failed to store metadata: %w", err) + } + + c.logger.Debug("Metadata stored", + zap.String("peer", shortPeerID(peerID)), + zap.String("node", metadata.NodeID)) + + return nil +} + diff --git a/pkg/rqlite/cluster_discovery_utils.go b/pkg/rqlite/cluster_discovery_utils.go new file mode 100644 index 0000000..d71e370 --- /dev/null +++ b/pkg/rqlite/cluster_discovery_utils.go @@ -0,0 +1,233 @@ +package rqlite + +import ( + "net" + "net/netip" + "strings" + + "github.com/DeBrosOfficial/network/pkg/discovery" + "github.com/libp2p/go-libp2p/core/peer" + "github.com/multiformats/go-multiaddr" + "go.uber.org/zap" +) + +// adjustPeerAdvertisedAddresses adjusts peer metadata addresses +func (c *ClusterDiscoveryService) adjustPeerAdvertisedAddresses(peerID peer.ID, meta *discovery.RQLiteNodeMetadata) (bool, string) { + ip := c.selectPeerIP(peerID) + if ip == "" { + return false, "" + } + + changed, stale := rewriteAdvertisedAddresses(meta, ip, true) + if changed { + c.logger.Debug("Addresses normalized", + zap.String("peer", shortPeerID(peerID)), + zap.String("raft", meta.RaftAddress), + zap.String("http_address", meta.HTTPAddress)) + } + return changed, stale +} + +// adjustSelfAdvertisedAddresses adjusts our own metadata addresses +func (c *ClusterDiscoveryService) adjustSelfAdvertisedAddresses(meta *discovery.RQLiteNodeMetadata) bool { + ip := c.selectSelfIP() + if ip == "" { + return false + } + + changed, _ := rewriteAdvertisedAddresses(meta, ip, true) + if !changed { + return false + } + + c.mu.Lock() + c.raftAddress = meta.RaftAddress + c.httpAddress = meta.HTTPAddress + c.mu.Unlock() + + if c.rqliteManager != nil { + c.rqliteManager.UpdateAdvertisedAddresses(meta.RaftAddress, meta.HTTPAddress) + } + + return true +} + +// selectPeerIP selects the best IP address for a peer +func (c *ClusterDiscoveryService) selectPeerIP(peerID peer.ID) string { + var fallback string + + for _, conn := range c.host.Network().ConnsToPeer(peerID) { + if ip, public := ipFromMultiaddr(conn.RemoteMultiaddr()); ip != "" { + if shouldReplaceHost(ip) { + continue + } + if public { + return ip + } + if fallback == "" { + fallback = ip + } + } + } + + for _, addr := range c.host.Peerstore().Addrs(peerID) { + if ip, public := ipFromMultiaddr(addr); ip != "" { + if shouldReplaceHost(ip) { + continue + } + if public { + return ip + } + if fallback == "" { + fallback = ip + } + } + } + + return fallback +} + +// selectSelfIP selects the best IP address for ourselves +func (c *ClusterDiscoveryService) selectSelfIP() string { + var fallback string + + for _, addr := range c.host.Addrs() { + if ip, public := ipFromMultiaddr(addr); ip != "" { + if shouldReplaceHost(ip) { + continue + } + if public { + return ip + } + if fallback == "" { + fallback = ip + } + } + } + + return 
fallback +} + +// rewriteAdvertisedAddresses rewrites RaftAddress and HTTPAddress in metadata +func rewriteAdvertisedAddresses(meta *discovery.RQLiteNodeMetadata, newHost string, allowNodeIDRewrite bool) (bool, string) { + if meta == nil || newHost == "" { + return false, "" + } + + originalNodeID := meta.NodeID + changed := false + nodeIDChanged := false + + if newAddr, replaced := replaceAddressHost(meta.RaftAddress, newHost); replaced { + if meta.RaftAddress != newAddr { + meta.RaftAddress = newAddr + changed = true + } + } + + if newAddr, replaced := replaceAddressHost(meta.HTTPAddress, newHost); replaced { + if meta.HTTPAddress != newAddr { + meta.HTTPAddress = newAddr + changed = true + } + } + + if allowNodeIDRewrite { + if meta.RaftAddress != "" && (meta.NodeID == "" || meta.NodeID == originalNodeID || shouldReplaceHost(hostFromAddress(meta.NodeID))) { + if meta.NodeID != meta.RaftAddress { + meta.NodeID = meta.RaftAddress + nodeIDChanged = meta.NodeID != originalNodeID + if nodeIDChanged { + changed = true + } + } + } + } + + if nodeIDChanged { + return changed, originalNodeID + } + return changed, "" +} + +// replaceAddressHost replaces the host part of an address +func replaceAddressHost(address, newHost string) (string, bool) { + if address == "" || newHost == "" { + return address, false + } + + host, port, err := net.SplitHostPort(address) + if err != nil { + return address, false + } + + if !shouldReplaceHost(host) { + return address, false + } + + return net.JoinHostPort(newHost, port), true +} + +// shouldReplaceHost returns true if the host should be replaced +func shouldReplaceHost(host string) bool { + if host == "" { + return true + } + if strings.EqualFold(host, "localhost") { + return true + } + + if addr, err := netip.ParseAddr(host); err == nil { + if addr.IsLoopback() || addr.IsUnspecified() { + return true + } + } + + return false +} + +// hostFromAddress extracts the host part from a host:port address +func hostFromAddress(address string) string { + host, _, err := net.SplitHostPort(address) + if err != nil { + return "" + } + return host +} + +// ipFromMultiaddr extracts an IP address from a multiaddr and returns (ip, isPublic) +func ipFromMultiaddr(addr multiaddr.Multiaddr) (string, bool) { + if addr == nil { + return "", false + } + + if v4, err := addr.ValueForProtocol(multiaddr.P_IP4); err == nil { + return v4, isPublicIP(v4) + } + if v6, err := addr.ValueForProtocol(multiaddr.P_IP6); err == nil { + return v6, isPublicIP(v6) + } + return "", false +} + +// isPublicIP returns true if the IP is a public address +func isPublicIP(ip string) bool { + addr, err := netip.ParseAddr(ip) + if err != nil { + return false + } + if addr.IsLoopback() || addr.IsUnspecified() || addr.IsLinkLocalUnicast() || addr.IsLinkLocalMulticast() || addr.IsPrivate() { + return false + } + return true +} + +// shortPeerID returns a shortened version of a peer ID +func shortPeerID(id peer.ID) string { + s := id.String() + if len(s) <= 8 { + return s + } + return s[:8] + "..." 
+} + diff --git a/pkg/rqlite/discovery_manager.go b/pkg/rqlite/discovery_manager.go new file mode 100644 index 0000000..2728239 --- /dev/null +++ b/pkg/rqlite/discovery_manager.go @@ -0,0 +1,61 @@ +package rqlite + +import ( + "fmt" + "time" +) + +// SetDiscoveryService sets the cluster discovery service +func (r *RQLiteManager) SetDiscoveryService(service *ClusterDiscoveryService) { + r.discoveryService = service +} + +// SetNodeType sets the node type +func (r *RQLiteManager) SetNodeType(nodeType string) { + if nodeType != "" { + r.nodeType = nodeType + } +} + +// UpdateAdvertisedAddresses overrides advertised addresses +func (r *RQLiteManager) UpdateAdvertisedAddresses(raftAddr, httpAddr string) { + if r == nil || r.discoverConfig == nil { + return + } + if raftAddr != "" && r.discoverConfig.RaftAdvAddress != raftAddr { + r.discoverConfig.RaftAdvAddress = raftAddr + } + if httpAddr != "" && r.discoverConfig.HttpAdvAddress != httpAddr { + r.discoverConfig.HttpAdvAddress = httpAddr + } +} + +func (r *RQLiteManager) validateNodeID() error { + for i := 0; i < 5; i++ { + nodes, err := r.getRQLiteNodes() + if err != nil { + if i < 4 { + time.Sleep(500 * time.Millisecond) + continue + } + return nil + } + + expectedID := r.discoverConfig.RaftAdvAddress + if expectedID == "" || len(nodes) == 0 { + return nil + } + + for _, node := range nodes { + if node.Address == expectedID { + if node.ID != expectedID { + return fmt.Errorf("node ID mismatch: %s != %s", expectedID, node.ID) + } + return nil + } + } + return nil + } + return nil +} + diff --git a/pkg/rqlite/process.go b/pkg/rqlite/process.go new file mode 100644 index 0000000..b11ffa4 --- /dev/null +++ b/pkg/rqlite/process.go @@ -0,0 +1,239 @@ +package rqlite + +import ( + "context" + "encoding/json" + "fmt" + "io" + "net/http" + "os" + "os/exec" + "path/filepath" + "strings" + "time" + + "github.com/DeBrosOfficial/network/pkg/tlsutil" + "github.com/rqlite/gorqlite" + "go.uber.org/zap" +) + +// launchProcess starts the RQLite process with appropriate arguments +func (r *RQLiteManager) launchProcess(ctx context.Context, rqliteDataDir string) error { + // Build RQLite command + args := []string{ + "-http-addr", fmt.Sprintf("0.0.0.0:%d", r.config.RQLitePort), + "-http-adv-addr", r.discoverConfig.HttpAdvAddress, + "-raft-adv-addr", r.discoverConfig.RaftAdvAddress, + "-raft-addr", fmt.Sprintf("0.0.0.0:%d", r.config.RQLiteRaftPort), + } + + if r.config.NodeCert != "" && r.config.NodeKey != "" { + r.logger.Info("Enabling node-to-node TLS encryption", + zap.String("node_cert", r.config.NodeCert), + zap.String("node_key", r.config.NodeKey)) + + args = append(args, "-node-cert", r.config.NodeCert) + args = append(args, "-node-key", r.config.NodeKey) + + if r.config.NodeCACert != "" { + args = append(args, "-node-ca-cert", r.config.NodeCACert) + } + if r.config.NodeNoVerify { + args = append(args, "-node-no-verify") + } + } + + if r.config.RQLiteJoinAddress != "" { + r.logger.Info("Joining RQLite cluster", zap.String("join_address", r.config.RQLiteJoinAddress)) + + joinArg := r.config.RQLiteJoinAddress + if strings.HasPrefix(joinArg, "http://") { + joinArg = strings.TrimPrefix(joinArg, "http://") + } else if strings.HasPrefix(joinArg, "https://") { + joinArg = strings.TrimPrefix(joinArg, "https://") + } + + joinTimeout := 5 * time.Minute + if err := r.waitForJoinTarget(ctx, r.config.RQLiteJoinAddress, joinTimeout); err != nil { + r.logger.Warn("Join target did not become reachable within timeout; will still attempt to join", + zap.Error(err)) + } + + 
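+		// Illustrative note (values below are hypothetical, not taken from any real deployment):
+		// with RQLiteJoinAddress = "https://10.0.0.2:5001" and RaftAdvAddress = "10.0.0.3:7001",
+		// joinArg is normalized to "10.0.0.2:5001" and rqlited is started with:
+		//   -join 10.0.0.2:5001 -join-as 10.0.0.3:7001 -join-attempts 30 -join-interval 10s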
args = append(args, "-join", joinArg, "-join-as", r.discoverConfig.RaftAdvAddress, "-join-attempts", "30", "-join-interval", "10s") + } + + args = append(args, rqliteDataDir) + + r.cmd = exec.Command("rqlited", args...) + + nodeType := r.nodeType + if nodeType == "" { + nodeType = "node" + } + + logsDir := filepath.Join(filepath.Dir(r.dataDir), "logs") + _ = os.MkdirAll(logsDir, 0755) + + logPath := filepath.Join(logsDir, fmt.Sprintf("rqlite-%s.log", nodeType)) + logFile, err := os.OpenFile(logPath, os.O_CREATE|os.O_WRONLY|os.O_APPEND, 0644) + if err != nil { + return fmt.Errorf("failed to open log file: %w", err) + } + + r.cmd.Stdout = logFile + r.cmd.Stderr = logFile + + if err := r.cmd.Start(); err != nil { + logFile.Close() + return fmt.Errorf("failed to start RQLite: %w", err) + } + + logFile.Close() + return nil +} + +// waitForReadyAndConnect waits for RQLite to be ready and establishes connection +func (r *RQLiteManager) waitForReadyAndConnect(ctx context.Context) error { + if err := r.waitForReady(ctx); err != nil { + if r.cmd != nil && r.cmd.Process != nil { + _ = r.cmd.Process.Kill() + } + return err + } + + var conn *gorqlite.Connection + var err error + maxConnectAttempts := 10 + connectBackoff := 500 * time.Millisecond + + for attempt := 0; attempt < maxConnectAttempts; attempt++ { + conn, err = gorqlite.Open(fmt.Sprintf("http://localhost:%d", r.config.RQLitePort)) + if err == nil { + r.connection = conn + break + } + + if strings.Contains(err.Error(), "store is not open") { + time.Sleep(connectBackoff) + connectBackoff = time.Duration(float64(connectBackoff) * 1.5) + if connectBackoff > 5*time.Second { + connectBackoff = 5 * time.Second + } + continue + } + + if r.cmd != nil && r.cmd.Process != nil { + _ = r.cmd.Process.Kill() + } + return fmt.Errorf("failed to connect to RQLite: %w", err) + } + + if conn == nil { + return fmt.Errorf("failed to connect to RQLite after max attempts") + } + + _ = r.validateNodeID() + return nil +} + +// waitForReady waits for RQLite to be ready to accept connections +func (r *RQLiteManager) waitForReady(ctx context.Context) error { + url := fmt.Sprintf("http://localhost:%d/status", r.config.RQLitePort) + client := tlsutil.NewHTTPClient(2 * time.Second) + + for i := 0; i < 180; i++ { + select { + case <-ctx.Done(): + return ctx.Err() + case <-time.After(1 * time.Second): + } + + resp, err := client.Get(url) + if err == nil && resp.StatusCode == http.StatusOK { + body, _ := io.ReadAll(resp.Body) + resp.Body.Close() + var statusResp map[string]interface{} + if err := json.Unmarshal(body, &statusResp); err == nil { + if raft, ok := statusResp["raft"].(map[string]interface{}); ok { + state, _ := raft["state"].(string) + if state == "leader" || state == "follower" { + return nil + } + } else { + return nil // Backwards compatibility + } + } + } + } + + return fmt.Errorf("RQLite did not become ready within timeout") +} + +// waitForSQLAvailable waits until a simple query succeeds +func (r *RQLiteManager) waitForSQLAvailable(ctx context.Context) error { + ticker := time.NewTicker(1 * time.Second) + defer ticker.Stop() + + for { + select { + case <-ctx.Done(): + return ctx.Err() + case <-ticker.C: + if r.connection == nil { + continue + } + _, err := r.connection.QueryOne("SELECT 1") + if err == nil { + return nil + } + } + } +} + +// testJoinAddress tests if a join address is reachable +func (r *RQLiteManager) testJoinAddress(joinAddress string) error { + client := tlsutil.NewHTTPClient(5 * time.Second) + var statusURL string + if 
strings.HasPrefix(joinAddress, "http://") || strings.HasPrefix(joinAddress, "https://") { + statusURL = strings.TrimRight(joinAddress, "/") + "/status" + } else { + host := joinAddress + if idx := strings.Index(joinAddress, ":"); idx != -1 { + host = joinAddress[:idx] + } + statusURL = fmt.Sprintf("http://%s:%d/status", host, 5001) + } + + resp, err := client.Get(statusURL) + if err != nil { + return err + } + defer resp.Body.Close() + if resp.StatusCode != http.StatusOK { + return fmt.Errorf("leader returned status %d", resp.StatusCode) + } + return nil +} + +// waitForJoinTarget waits until the join target's HTTP status becomes reachable +func (r *RQLiteManager) waitForJoinTarget(ctx context.Context, joinAddress string, timeout time.Duration) error { + deadline := time.Now().Add(timeout) + var lastErr error + + for time.Now().Before(deadline) { + if err := r.testJoinAddress(joinAddress); err == nil { + return nil + } else { + lastErr = err + } + + select { + case <-ctx.Done(): + return ctx.Err() + case <-time.After(2 * time.Second): + } + } + + return lastErr +} + diff --git a/pkg/rqlite/rqlite.go b/pkg/rqlite/rqlite.go index 3597f65..087b6e2 100644 --- a/pkg/rqlite/rqlite.go +++ b/pkg/rqlite/rqlite.go @@ -2,23 +2,14 @@ package rqlite import ( "context" - "encoding/json" - "errors" "fmt" - "io" - "net/http" - "os" "os/exec" - "path/filepath" - "strings" "syscall" "time" + "github.com/DeBrosOfficial/network/pkg/config" "github.com/rqlite/gorqlite" "go.uber.org/zap" - - "github.com/DeBrosOfficial/network/pkg/config" - "github.com/DeBrosOfficial/network/pkg/tlsutil" ) // RQLiteManager manages an RQLite node instance @@ -33,40 +24,6 @@ type RQLiteManager struct { discoveryService *ClusterDiscoveryService } -// waitForSQLAvailable waits until a simple query succeeds, indicating a leader is known and queries can be served. -func (r *RQLiteManager) waitForSQLAvailable(ctx context.Context) error { - ticker := time.NewTicker(1 * time.Second) - defer ticker.Stop() - - attempts := 0 - for { - select { - case <-ctx.Done(): - return ctx.Err() - case <-ticker.C: - // Check for nil connection inside the loop to handle cases where - // connection becomes nil during restart/recovery operations - if r.connection == nil { - attempts++ - if attempts%5 == 0 { // log every ~5s to reduce noise - r.logger.Debug("Waiting for RQLite connection to be established") - } - continue - } - - attempts++ - _, err := r.connection.QueryOne("SELECT 1") - if err == nil { - r.logger.Info("RQLite SQL is available") - return nil - } - if attempts%5 == 0 { // log every ~5s to reduce noise - r.logger.Debug("Waiting for RQLite SQL availability", zap.Error(err)) - } - } - } -} - // NewRQLiteManager creates a new RQLite manager func NewRQLiteManager(cfg *config.DatabaseConfig, discoveryCfg *config.DiscoveryConfig, dataDir string, logger *zap.Logger) *RQLiteManager { return &RQLiteManager{ @@ -77,36 +34,6 @@ func NewRQLiteManager(cfg *config.DatabaseConfig, discoveryCfg *config.Discovery } } -// SetDiscoveryService sets the cluster discovery service for this RQLite manager -func (r *RQLiteManager) SetDiscoveryService(service *ClusterDiscoveryService) { - r.discoveryService = service -} - -// SetNodeType sets the node type for this RQLite manager -func (r *RQLiteManager) SetNodeType(nodeType string) { - if nodeType != "" { - r.nodeType = nodeType - } -} - -// UpdateAdvertisedAddresses overrides the discovery advertised addresses when cluster discovery -// infers a better host than what was provided via configuration (e.g. 
replacing localhost). -func (r *RQLiteManager) UpdateAdvertisedAddresses(raftAddr, httpAddr string) { - if r == nil || r.discoverConfig == nil { - return - } - - if raftAddr != "" && r.discoverConfig.RaftAdvAddress != raftAddr { - r.logger.Info("Updating Raft advertised address", zap.String("addr", raftAddr)) - r.discoverConfig.RaftAdvAddress = raftAddr - } - - if httpAddr != "" && r.discoverConfig.HttpAdvAddress != httpAddr { - r.logger.Info("Updating HTTP advertised address", zap.String("addr", httpAddr)) - r.discoverConfig.HttpAdvAddress = httpAddr - } -} - // Start starts the RQLite node func (r *RQLiteManager) Start(ctx context.Context) error { rqliteDataDir, err := r.prepareDataDir() @@ -118,434 +45,40 @@ func (r *RQLiteManager) Start(ctx context.Context) error { return fmt.Errorf("discovery config HttpAdvAddress is empty") } - // CRITICAL FIX: Ensure peers.json exists with minimum cluster size BEFORE starting RQLite - // This prevents split-brain where each node starts as a single-node cluster - // We NEVER start as a single-node cluster - we wait indefinitely until minimum cluster size is met - // This applies to ALL nodes (with or without join addresses) if r.discoveryService != nil { - r.logger.Info("Ensuring peers.json exists with minimum cluster size before RQLite startup", - zap.String("policy", "will wait indefinitely - never start as single-node cluster"), - zap.Bool("has_join_address", r.config.RQLiteJoinAddress != "")) - - // Wait for peer discovery to find minimum cluster size - NO TIMEOUT - // This ensures we never start as a single-node cluster, regardless of join address if err := r.waitForMinClusterSizeBeforeStart(ctx, rqliteDataDir); err != nil { - r.logger.Error("Failed to ensure minimum cluster size before start", - zap.Error(err), - zap.String("action", "startup aborted - will not start as single-node cluster")) - return fmt.Errorf("cannot start RQLite: minimum cluster size not met: %w", err) + return err } } - // CRITICAL: Check if we need to do pre-start cluster discovery to build peers.json - // This handles the case where nodes have old cluster state and need coordinated recovery - if needsClusterRecovery, err := r.checkNeedsClusterRecovery(rqliteDataDir); err != nil { - return fmt.Errorf("failed to check cluster recovery status: %w", err) - } else if needsClusterRecovery { - r.logger.Info("Detected old cluster state requiring coordinated recovery") + if needsClusterRecovery, err := r.checkNeedsClusterRecovery(rqliteDataDir); err == nil && needsClusterRecovery { if err := r.performPreStartClusterDiscovery(ctx, rqliteDataDir); err != nil { - return fmt.Errorf("pre-start cluster discovery failed: %w", err) + return err } } - // Launch RQLite process if err := r.launchProcess(ctx, rqliteDataDir); err != nil { return err } - // Wait for RQLite to be ready and establish connection if err := r.waitForReadyAndConnect(ctx); err != nil { return err } - // Start periodic health monitoring for automatic recovery if r.discoveryService != nil { go r.startHealthMonitoring(ctx) } - // Establish leadership/SQL availability if err := r.establishLeadershipOrJoin(ctx, rqliteDataDir); err != nil { return err } - // Apply migrations - resolve path for production vs development - migrationsDir, err := r.resolveMigrationsDir() - if err != nil { - r.logger.Error("Failed to resolve migrations directory", zap.Error(err)) - return fmt.Errorf("resolve migrations directory: %w", err) - } - if err := r.ApplyMigrations(ctx, migrationsDir); err != nil { - r.logger.Error("Migrations failed", 
zap.Error(err), zap.String("dir", migrationsDir)) - return fmt.Errorf("apply migrations: %w", err) - } - - r.logger.Info("RQLite node started successfully") - return nil -} - -// rqliteDataDirPath returns the resolved path to the RQLite data directory -// This centralizes the path resolution logic used throughout the codebase -func (r *RQLiteManager) rqliteDataDirPath() (string, error) { - // Expand ~ in data directory path - dataDir := os.ExpandEnv(r.dataDir) - if strings.HasPrefix(dataDir, "~") { - home, err := os.UserHomeDir() - if err != nil { - return "", fmt.Errorf("failed to determine home directory: %w", err) - } - dataDir = filepath.Join(home, dataDir[1:]) - } - - return filepath.Join(dataDir, "rqlite"), nil -} - -// resolveMigrationsDir resolves the migrations directory path for production vs development -// In production, migrations are at /home/debros/src/migrations -// In development, migrations are relative to the project root (migrations/) -func (r *RQLiteManager) resolveMigrationsDir() (string, error) { - // Check for production path first: /home/debros/src/migrations - productionPath := "/home/debros/src/migrations" - if _, err := os.Stat(productionPath); err == nil { - r.logger.Info("Using production migrations directory", zap.String("path", productionPath)) - return productionPath, nil - } - - // Fall back to relative path for development - devPath := "migrations" - r.logger.Info("Using development migrations directory", zap.String("path", devPath)) - return devPath, nil -} - -// prepareDataDir expands and creates the RQLite data directory -func (r *RQLiteManager) prepareDataDir() (string, error) { - rqliteDataDir, err := r.rqliteDataDirPath() - if err != nil { - return "", err - } - - // Create data directory - if err := os.MkdirAll(rqliteDataDir, 0755); err != nil { - return "", fmt.Errorf("failed to create RQLite data directory: %w", err) - } - - return rqliteDataDir, nil -} - -// launchProcess starts the RQLite process with appropriate arguments -func (r *RQLiteManager) launchProcess(ctx context.Context, rqliteDataDir string) error { - // Build RQLite command - args := []string{ - "-http-addr", fmt.Sprintf("0.0.0.0:%d", r.config.RQLitePort), - "-http-adv-addr", r.discoverConfig.HttpAdvAddress, - "-raft-adv-addr", r.discoverConfig.RaftAdvAddress, - "-raft-addr", fmt.Sprintf("0.0.0.0:%d", r.config.RQLiteRaftPort), - } - - // Add node-to-node TLS encryption if configured - // This enables TLS for Raft inter-node communication, required for SNI gateway routing - // See: https://rqlite.io/docs/guides/security/#encrypting-node-to-node-communication - if r.config.NodeCert != "" && r.config.NodeKey != "" { - r.logger.Info("Enabling node-to-node TLS encryption", - zap.String("node_cert", r.config.NodeCert), - zap.String("node_key", r.config.NodeKey), - zap.String("node_ca_cert", r.config.NodeCACert), - zap.Bool("node_no_verify", r.config.NodeNoVerify)) - - args = append(args, "-node-cert", r.config.NodeCert) - args = append(args, "-node-key", r.config.NodeKey) - - if r.config.NodeCACert != "" { - args = append(args, "-node-ca-cert", r.config.NodeCACert) - } - if r.config.NodeNoVerify { - args = append(args, "-node-no-verify") - } - } - - // All nodes follow the same join logic - either join specified address or start as single-node cluster - if r.config.RQLiteJoinAddress != "" { - r.logger.Info("Joining RQLite cluster", zap.String("join_address", r.config.RQLiteJoinAddress)) - - // Normalize join address to host:port for rqlited -join - joinArg := r.config.RQLiteJoinAddress - 
if strings.HasPrefix(joinArg, "http://") { - joinArg = strings.TrimPrefix(joinArg, "http://") - } else if strings.HasPrefix(joinArg, "https://") { - joinArg = strings.TrimPrefix(joinArg, "https://") - } - - // Wait for join target to become reachable to avoid forming a separate cluster - // Use 5 minute timeout to prevent infinite waits on bad configurations - joinTimeout := 5 * time.Minute - if err := r.waitForJoinTarget(ctx, r.config.RQLiteJoinAddress, joinTimeout); err != nil { - r.logger.Warn("Join target did not become reachable within timeout; will still attempt to join", - zap.String("join_address", r.config.RQLiteJoinAddress), - zap.Duration("timeout", joinTimeout), - zap.Error(err)) - } - - // Always add the join parameter in host:port form - let rqlited handle the rest - // Add retry parameters to handle slow cluster startup (e.g., during recovery) - // Include -join-as with the raft advertise address so the leader knows which node this is - args = append(args, "-join", joinArg, "-join-as", r.discoverConfig.RaftAdvAddress, "-join-attempts", "30", "-join-interval", "10s") - } else { - r.logger.Info("No join address specified - starting as single-node cluster") - // When no join address is provided, rqlited will start as a single-node cluster - // This is expected for the first node in a fresh cluster - } - - // Add data directory as positional argument - args = append(args, rqliteDataDir) - - r.logger.Info("Starting RQLite node", - zap.String("data_dir", rqliteDataDir), - zap.Int("http_port", r.config.RQLitePort), - zap.Int("raft_port", r.config.RQLiteRaftPort), - zap.String("join_address", r.config.RQLiteJoinAddress)) - - // Start RQLite process (not bound to ctx for graceful Stop handling) - r.cmd = exec.Command("rqlited", args...) - - // Setup log file for RQLite output - // Determine node type for log filename - nodeType := r.nodeType - if nodeType == "" { - nodeType = "node" - } - - // Create logs directory - logsDir := filepath.Join(filepath.Dir(r.dataDir), "logs") - if err := os.MkdirAll(logsDir, 0755); err != nil { - return fmt.Errorf("failed to create logs directory at %s: %w", logsDir, err) - } - - // Open log file for RQLite output - logPath := filepath.Join(logsDir, fmt.Sprintf("rqlite-%s.log", nodeType)) - logFile, err := os.OpenFile(logPath, os.O_CREATE|os.O_WRONLY|os.O_APPEND, 0644) - if err != nil { - return fmt.Errorf("failed to open RQLite log file at %s: %w", logPath, err) - } - - r.logger.Info("RQLite logs will be written to file", - zap.String("path", logPath)) - - r.cmd.Stdout = logFile - r.cmd.Stderr = logFile - - if err := r.cmd.Start(); err != nil { - logFile.Close() - return fmt.Errorf("failed to start RQLite: %w", err) - } - - // Close the log file handle after process starts (the subprocess maintains its own reference) - // This allows the file to be rotated or inspected while the process is running - logFile.Close() + migrationsDir, _ := r.resolveMigrationsDir() + _ = r.ApplyMigrations(ctx, migrationsDir) return nil } -// waitForReadyAndConnect waits for RQLite to be ready and establishes connection -// For joining nodes, retries if gorqlite.Open fails with "store is not open" error -func (r *RQLiteManager) waitForReadyAndConnect(ctx context.Context) error { - // Wait for RQLite to be ready - if err := r.waitForReady(ctx); err != nil { - if r.cmd != nil && r.cmd.Process != nil { - _ = r.cmd.Process.Kill() - } - return fmt.Errorf("RQLite failed to become ready: %w", err) - } - - // For joining nodes, retry gorqlite.Open if store is not yet open - // 
This handles recovery scenarios where the store opens after HTTP is responsive - var conn *gorqlite.Connection - var err error - maxConnectAttempts := 10 - connectBackoff := 500 * time.Millisecond - - for attempt := 0; attempt < maxConnectAttempts; attempt++ { - // Create connection - conn, err = gorqlite.Open(fmt.Sprintf("http://localhost:%d", r.config.RQLitePort)) - if err == nil { - // Success - r.connection = conn - r.logger.Debug("Successfully connected to RQLite", zap.Int("attempt", attempt+1)) - break - } - - // Check if error is "store is not open" (recovery scenario) - if strings.Contains(err.Error(), "store is not open") { - if attempt < maxConnectAttempts-1 { - // Retry with exponential backoff for all nodes during recovery - // The store may not open immediately, especially during cluster recovery - if attempt%3 == 0 { - r.logger.Debug("RQLite store not yet accessible for connection, retrying...", - zap.Int("attempt", attempt+1), zap.Error(err)) - } - time.Sleep(connectBackoff) - connectBackoff = time.Duration(float64(connectBackoff) * 1.5) - if connectBackoff > 5*time.Second { - connectBackoff = 5 * time.Second - } - continue - } - } - - // For any other error or final attempt, fail - if r.cmd != nil && r.cmd.Process != nil { - _ = r.cmd.Process.Kill() - } - return fmt.Errorf("failed to connect to RQLite: %w", err) - } - - if conn == nil { - if r.cmd != nil && r.cmd.Process != nil { - _ = r.cmd.Process.Kill() - } - return fmt.Errorf("failed to establish RQLite connection after %d attempts", maxConnectAttempts) - } - - // Sanity check: verify rqlite's node ID matches our configured raft address - if err := r.validateNodeID(); err != nil { - r.logger.Debug("Node ID validation skipped", zap.Error(err)) - // Don't fail startup, but log at debug level - } - - return nil -} - -// establishLeadershipOrJoin handles post-startup cluster establishment -// All nodes follow the same pattern: wait for SQL availability -// For nodes without a join address, RQLite automatically forms a single-node cluster and becomes leader -func (r *RQLiteManager) establishLeadershipOrJoin(ctx context.Context, rqliteDataDir string) error { - if r.config.RQLiteJoinAddress == "" { - // First node - no join address specified - // RQLite will automatically form a single-node cluster and become leader - r.logger.Info("Starting as first node in cluster") - - // Wait for SQL to be available (indicates RQLite cluster is ready) - sqlCtx := ctx - if _, hasDeadline := ctx.Deadline(); !hasDeadline { - var cancel context.CancelFunc - sqlCtx, cancel = context.WithTimeout(context.Background(), 2*time.Minute) - defer cancel() - } - - if err := r.waitForSQLAvailable(sqlCtx); err != nil { - if r.cmd != nil && r.cmd.Process != nil { - _ = r.cmd.Process.Kill() - } - return fmt.Errorf("SQL not available for first node: %w", err) - } - - r.logger.Info("First node established successfully") - return nil - } - - // Joining node - wait for SQL availability (indicates it joined the leader) - r.logger.Info("Waiting for RQLite SQL availability (joining cluster)") - sqlCtx := ctx - if _, hasDeadline := ctx.Deadline(); !hasDeadline { - var cancel context.CancelFunc - sqlCtx, cancel = context.WithTimeout(context.Background(), 5*time.Minute) - defer cancel() - } - - if err := r.waitForSQLAvailable(sqlCtx); err != nil { - if r.cmd != nil && r.cmd.Process != nil { - _ = r.cmd.Process.Kill() - } - return fmt.Errorf("RQLite SQL not available: %w", err) - } - - r.logger.Info("Node successfully joined cluster") - return nil -} - -// 
hasExistingState returns true if the rqlite data directory already contains files or subdirectories. -func (r *RQLiteManager) hasExistingState(rqliteDataDir string) bool { - entries, err := os.ReadDir(rqliteDataDir) - if err != nil { - return false - } - for _, e := range entries { - // Any existing file or directory indicates prior state - if e.Name() == "." || e.Name() == ".." { - continue - } - return true - } - return false -} - -// waitForReady waits for RQLite to be ready to accept connections -// It checks for HTTP 200 + valid raft state (leader/follower) -// The store may not be fully open initially during recovery, but connection retries will handle it -// For joining nodes in recovery, this may take longer (up to 3 minutes) -func (r *RQLiteManager) waitForReady(ctx context.Context) error { - url := fmt.Sprintf("http://localhost:%d/status", r.config.RQLitePort) - client := tlsutil.NewHTTPClient(2 * time.Second) - - // All nodes may need time to open the store during recovery - // Use consistent timeout for cluster consistency - maxAttempts := 180 // 180 seconds (3 minutes) for all nodes - - for i := 0; i < maxAttempts; i++ { - select { - case <-ctx.Done(): - return ctx.Err() - default: - } - - // Use centralized TLS configuration - if client == nil { - client = tlsutil.NewHTTPClient(2 * time.Second) - } - - resp, err := client.Get(url) - if err == nil && resp.StatusCode == http.StatusOK { - // Parse the response to check for valid raft state - body, err := io.ReadAll(resp.Body) - resp.Body.Close() - if err == nil { - var statusResp map[string]interface{} - if err := json.Unmarshal(body, &statusResp); err == nil { - // Check for valid raft state (leader or follower) - // If raft is established, we consider the node ready even if store.open is false - // The store will eventually open during recovery, and connection retries will handle it - if raft, ok := statusResp["raft"].(map[string]interface{}); ok { - state, ok := raft["state"].(string) - if ok && (state == "leader" || state == "follower") { - r.logger.Debug("RQLite raft ready", zap.String("state", state), zap.Int("attempt", i+1)) - return nil - } - // Raft not yet ready (likely in candidate state) - if i%10 == 0 { - r.logger.Debug("RQLite raft not yet ready", zap.String("state", state), zap.Int("attempt", i+1)) - } - } else { - // If no raft field, fall back to treating HTTP 200 as ready - // (for backwards compatibility with older RQLite versions) - r.logger.Debug("RQLite HTTP responsive (no raft field)", zap.Int("attempt", i+1)) - return nil - } - } else { - resp.Body.Close() - } - } - } else if err != nil && i%20 == 0 { - // Log connection errors only periodically (every ~20s) - r.logger.Debug("RQLite not yet reachable", zap.Int("attempt", i+1), zap.Error(err)) - } else if resp != nil { - resp.Body.Close() - } - - time.Sleep(1 * time.Second) - } - - return fmt.Errorf("RQLite did not become ready within timeout") -} - -// GetConnection returns the RQLite connection // GetConnection returns the RQLite connection func (r *RQLiteManager) GetConnection() *gorqlite.Connection { return r.connection @@ -562,806 +95,16 @@ func (r *RQLiteManager) Stop() error { return nil } - r.logger.Info("Stopping RQLite node (graceful)") - // Try SIGTERM first - if err := r.cmd.Process.Signal(syscall.SIGTERM); err != nil { - // Fallback to Kill if signaling fails - _ = r.cmd.Process.Kill() - return nil - } - - // Wait up to 5 seconds for graceful shutdown + _ = r.cmd.Process.Signal(syscall.SIGTERM) + done := make(chan error, 1) go func() { done <- 
r.cmd.Wait() }() select { - case err := <-done: - if err != nil && !errors.Is(err, os.ErrClosed) { - r.logger.Warn("RQLite process exited with error", zap.Error(err)) - } + case <-done: case <-time.After(5 * time.Second): - r.logger.Warn("RQLite did not exit in time; killing") _ = r.cmd.Process.Kill() } return nil } - -// waitForJoinTarget waits until the join target's HTTP status becomes reachable, or until timeout -func (r *RQLiteManager) waitForJoinTarget(ctx context.Context, joinAddress string, timeout time.Duration) error { - var deadline time.Time - if timeout > 0 { - deadline = time.Now().Add(timeout) - } - var lastErr error - - for { - if err := r.testJoinAddress(joinAddress); err == nil { - r.logger.Info("Join target is reachable, proceeding with cluster join") - return nil - } else { - lastErr = err - r.logger.Debug("Join target not yet reachable; waiting...", zap.String("join_address", joinAddress), zap.Error(err)) - } - - // Check context - select { - case <-ctx.Done(): - return ctx.Err() - case <-time.After(2 * time.Second): - } - - if !deadline.IsZero() && time.Now().After(deadline) { - break - } - } - - return lastErr -} - -// waitForMinClusterSizeBeforeStart waits for minimum cluster size to be discovered -// and ensures peers.json exists before RQLite starts -// CRITICAL: This function waits INDEFINITELY - it will NEVER timeout -// We never start as a single-node cluster, regardless of how long we wait -func (r *RQLiteManager) waitForMinClusterSizeBeforeStart(ctx context.Context, rqliteDataDir string) error { - if r.discoveryService == nil { - return fmt.Errorf("discovery service not available") - } - - requiredRemotePeers := r.config.MinClusterSize - 1 - r.logger.Info("Waiting for minimum cluster size before RQLite startup", - zap.Int("min_cluster_size", r.config.MinClusterSize), - zap.Int("required_remote_peers", requiredRemotePeers), - zap.String("policy", "waiting indefinitely - will never start as single-node cluster")) - - // Trigger peer exchange to collect metadata - if err := r.discoveryService.TriggerPeerExchange(ctx); err != nil { - r.logger.Warn("Peer exchange failed", zap.Error(err)) - } - - // NO TIMEOUT - wait indefinitely until minimum cluster size is met - // Only exit on context cancellation or when minimum cluster size is achieved - checkInterval := 2 * time.Second - lastLogTime := time.Now() - - for { - // Check context cancellation first - select { - case <-ctx.Done(): - return fmt.Errorf("context cancelled while waiting for minimum cluster size: %w", ctx.Err()) - default: - } - - // Trigger sync to update knownPeers - r.discoveryService.TriggerSync() - time.Sleep(checkInterval) - - // Check if we have enough remote peers - allPeers := r.discoveryService.GetAllPeers() - remotePeerCount := 0 - for _, peer := range allPeers { - if peer.NodeID != r.discoverConfig.RaftAdvAddress { - remotePeerCount++ - } - } - - if remotePeerCount >= requiredRemotePeers { - // Found enough peers - verify peers.json exists and contains them - peersPath := filepath.Join(rqliteDataDir, "raft", "peers.json") - - // Trigger one more sync to ensure peers.json is written - r.discoveryService.TriggerSync() - time.Sleep(2 * time.Second) - - // Verify peers.json exists and contains enough peers - if info, err := os.Stat(peersPath); err == nil && info.Size() > 10 { - // Read and verify it contains enough peers - data, err := os.ReadFile(peersPath) - if err == nil { - var peers []map[string]interface{} - if err := json.Unmarshal(data, &peers); err == nil && len(peers) >= 
requiredRemotePeers { - r.logger.Info("peers.json exists with minimum cluster size, safe to start RQLite", - zap.String("peers_file", peersPath), - zap.Int("remote_peers_discovered", remotePeerCount), - zap.Int("peers_in_json", len(peers)), - zap.Int("min_cluster_size", r.config.MinClusterSize)) - return nil - } - } - } - } - - // Log progress every 10 seconds - if time.Since(lastLogTime) >= 10*time.Second { - r.logger.Info("Waiting for minimum cluster size (indefinitely)...", - zap.Int("discovered_peers", len(allPeers)), - zap.Int("remote_peers", remotePeerCount), - zap.Int("required_remote_peers", requiredRemotePeers), - zap.String("status", "will continue waiting until minimum cluster size is met")) - lastLogTime = time.Now() - } - } -} - -// testJoinAddress tests if a join address is reachable -func (r *RQLiteManager) testJoinAddress(joinAddress string) error { - // Determine the HTTP status URL to probe. - // If joinAddress contains a scheme, use it directly. Otherwise treat joinAddress - // as host:port (Raft) and probe the standard HTTP API port 5001 on that host. - client := tlsutil.NewHTTPClient(5 * time.Second) - - var statusURL string - if strings.HasPrefix(joinAddress, "http://") || strings.HasPrefix(joinAddress, "https://") { - statusURL = strings.TrimRight(joinAddress, "/") + "/status" - } else { - // Extract host from host:port - host := joinAddress - if idx := strings.Index(joinAddress, ":"); idx != -1 { - host = joinAddress[:idx] - } - statusURL = fmt.Sprintf("http://%s:%d/status", host, 5001) - } - - r.logger.Debug("Testing join target via HTTP", zap.String("url", statusURL)) - resp, err := client.Get(statusURL) - if err != nil { - return fmt.Errorf("failed to connect to leader HTTP at %s: %w", statusURL, err) - } - defer resp.Body.Close() - if resp.StatusCode != http.StatusOK { - return fmt.Errorf("leader HTTP at %s returned status %d", statusURL, resp.StatusCode) - } - - r.logger.Info("Leader HTTP reachable", zap.String("status_url", statusURL)) - return nil -} - -// exponentialBackoff calculates exponential backoff duration with jitter -func (r *RQLiteManager) exponentialBackoff(attempt int, baseDelay time.Duration, maxDelay time.Duration) time.Duration { - // Calculate exponential backoff: baseDelay * 2^attempt - delay := baseDelay * time.Duration(1< maxDelay { - delay = maxDelay - } - - // Add jitter (±20%) - jitter := time.Duration(float64(delay) * 0.2 * (2.0*float64(time.Now().UnixNano()%100)/100.0 - 1.0)) - return delay + jitter -} - -// recoverCluster restarts RQLite using the recovery.db created from peers.json -// It reuses launchProcess and waitForReadyAndConnect to ensure all join/backoff logic -// and proper readiness checks are applied during recovery. -func (r *RQLiteManager) recoverCluster(ctx context.Context, peersJSONPath string) error { - r.logger.Info("Initiating cluster recovery by restarting RQLite", - zap.String("peers_file", peersJSONPath)) - - // Stop the current RQLite process - r.logger.Info("Stopping RQLite for recovery") - if err := r.Stop(); err != nil { - r.logger.Warn("Error stopping RQLite", zap.Error(err)) - } - - // Wait for process to fully stop - time.Sleep(2 * time.Second) - - // Get the data directory path - rqliteDataDir, err := r.rqliteDataDirPath() - if err != nil { - return fmt.Errorf("failed to resolve RQLite data directory: %w", err) - } - - // Restart RQLite using launchProcess to ensure all join/backoff logic is applied - // This includes: join address handling, join retries, expect configuration, etc. 
- r.logger.Info("Restarting RQLite (will auto-recover using peers.json)") - if err := r.launchProcess(ctx, rqliteDataDir); err != nil { - return fmt.Errorf("failed to restart RQLite process: %w", err) - } - - // Wait for RQLite to be ready and establish connection using proper readiness checks - // This includes retries for "store is not open" errors during recovery - if err := r.waitForReadyAndConnect(ctx); err != nil { - // Clean up the process if connection failed - if r.cmd != nil && r.cmd.Process != nil { - _ = r.cmd.Process.Kill() - } - return fmt.Errorf("failed to wait for RQLite readiness after recovery: %w", err) - } - - r.logger.Info("Cluster recovery completed, RQLite restarted with new configuration") - return nil -} - -// checkNeedsClusterRecovery checks if the node has old cluster state that requires coordinated recovery -// Returns true if there are snapshots but the raft log is empty (typical after a crash/restart) -func (r *RQLiteManager) checkNeedsClusterRecovery(rqliteDataDir string) (bool, error) { - // Check for snapshots directory - snapshotsDir := filepath.Join(rqliteDataDir, "rsnapshots") - if _, err := os.Stat(snapshotsDir); os.IsNotExist(err) { - // No snapshots = fresh start, no recovery needed - return false, nil - } - - // Check if snapshots directory has any snapshots - entries, err := os.ReadDir(snapshotsDir) - if err != nil { - return false, fmt.Errorf("failed to read snapshots directory: %w", err) - } - - hasSnapshots := false - for _, entry := range entries { - if entry.IsDir() || strings.HasSuffix(entry.Name(), ".db") { - hasSnapshots = true - break - } - } - - if !hasSnapshots { - // No snapshots = fresh start - return false, nil - } - - // Check raft log size - if it's the default empty size, we need recovery - raftLogPath := filepath.Join(rqliteDataDir, "raft.db") - if info, err := os.Stat(raftLogPath); err == nil { - // Empty or default-sized log with snapshots means we need coordinated recovery - if info.Size() <= 8*1024*1024 { // <= 8MB (default empty log size) - r.logger.Info("Detected cluster recovery situation: snapshots exist but raft log is empty/default size", - zap.String("snapshots_dir", snapshotsDir), - zap.Int64("raft_log_size", info.Size())) - return true, nil - } - } - - return false, nil -} - -// hasExistingRaftState checks if this node has any existing Raft state files -// Returns true if raft.db exists and has content, or if peers.json exists -func (r *RQLiteManager) hasExistingRaftState(rqliteDataDir string) bool { - // Check for raft.db - raftLogPath := filepath.Join(rqliteDataDir, "raft.db") - if info, err := os.Stat(raftLogPath); err == nil { - // If raft.db exists and has meaningful content (> 1KB), we have state - if info.Size() > 1024 { - return true - } - } - - // Check for peers.json - peersPath := filepath.Join(rqliteDataDir, "raft", "peers.json") - if _, err := os.Stat(peersPath); err == nil { - return true - } - - return false -} - -// clearRaftState safely removes Raft state files to allow a clean join -// This removes raft.db and peers.json but preserves db.sqlite -func (r *RQLiteManager) clearRaftState(rqliteDataDir string) error { - r.logger.Warn("Clearing Raft state to allow clean cluster join", - zap.String("data_dir", rqliteDataDir)) - - // Remove raft.db if it exists - raftLogPath := filepath.Join(rqliteDataDir, "raft.db") - if err := os.Remove(raftLogPath); err != nil && !os.IsNotExist(err) { - r.logger.Warn("Failed to remove raft.db", zap.Error(err)) - } else if err == nil { - r.logger.Info("Removed raft.db") - } 
- - // Remove peers.json if it exists - peersPath := filepath.Join(rqliteDataDir, "raft", "peers.json") - if err := os.Remove(peersPath); err != nil && !os.IsNotExist(err) { - r.logger.Warn("Failed to remove peers.json", zap.Error(err)) - } else if err == nil { - r.logger.Info("Removed peers.json") - } - - // Remove raft directory if it's empty - raftDir := filepath.Join(rqliteDataDir, "raft") - if entries, err := os.ReadDir(raftDir); err == nil && len(entries) == 0 { - if err := os.Remove(raftDir); err != nil { - r.logger.Debug("Failed to remove empty raft directory", zap.Error(err)) - } - } - - r.logger.Info("Raft state cleared successfully - node will join as fresh follower") - return nil -} - -// isInSplitBrainState detects if we're in a split-brain scenario where all nodes -// are followers with no peers (each node thinks it's alone) -func (r *RQLiteManager) isInSplitBrainState() bool { - status, err := r.getRQLiteStatus() - if err != nil { - return false - } - - raft := status.Store.Raft - - // Split-brain indicators: - // - State is Follower (not Leader) - // - Term is 0 (no leader election has occurred) - // - num_peers is 0 (node thinks it's alone) - // - voter is false (node not configured as voter) - isSplitBrain := raft.State == "Follower" && - raft.Term == 0 && - raft.NumPeers == 0 && - !raft.Voter && - raft.LeaderAddr == "" - - if !isSplitBrain { - return false - } - - // Verify all discovered peers are also in split-brain state - if r.discoveryService == nil { - r.logger.Debug("No discovery service to verify split-brain across peers") - return false - } - - peers := r.discoveryService.GetActivePeers() - if len(peers) == 0 { - // No peers discovered yet - might be network issue, not split-brain - return false - } - - // Check if all reachable peers are also in split-brain - splitBrainCount := 0 - reachableCount := 0 - for _, peer := range peers { - if !r.isPeerReachable(peer.HTTPAddress) { - continue - } - reachableCount++ - - peerStatus, err := r.getPeerRQLiteStatus(peer.HTTPAddress) - if err != nil { - continue - } - - peerRaft := peerStatus.Store.Raft - if peerRaft.State == "Follower" && - peerRaft.Term == 0 && - peerRaft.NumPeers == 0 && - !peerRaft.Voter { - splitBrainCount++ - } - } - - // If all reachable peers are in split-brain, we have cluster-wide split-brain - if reachableCount > 0 && splitBrainCount == reachableCount { - r.logger.Warn("Detected cluster-wide split-brain state", - zap.Int("reachable_peers", reachableCount), - zap.Int("split_brain_peers", splitBrainCount)) - return true - } - - return false -} - -// isPeerReachable checks if a peer is at least responding to HTTP requests -func (r *RQLiteManager) isPeerReachable(httpAddr string) bool { - url := fmt.Sprintf("http://%s/status", httpAddr) - client := &http.Client{Timeout: 3 * time.Second} - - resp, err := client.Get(url) - if err != nil { - return false - } - defer resp.Body.Close() - - return resp.StatusCode == http.StatusOK -} - -// getPeerRQLiteStatus queries a peer's status endpoint -func (r *RQLiteManager) getPeerRQLiteStatus(httpAddr string) (*RQLiteStatus, error) { - url := fmt.Sprintf("http://%s/status", httpAddr) - client := &http.Client{Timeout: 3 * time.Second} - - resp, err := client.Get(url) - if err != nil { - return nil, err - } - defer resp.Body.Close() - - if resp.StatusCode != http.StatusOK { - return nil, fmt.Errorf("peer returned status %d", resp.StatusCode) - } - - var status RQLiteStatus - if err := json.NewDecoder(resp.Body).Decode(&status); err != nil { - return nil, err - } - - 
return &status, nil -} - -// startHealthMonitoring runs periodic health checks and automatically recovers from split-brain -func (r *RQLiteManager) startHealthMonitoring(ctx context.Context) { - // Wait a bit after startup before starting health checks - time.Sleep(30 * time.Second) - - ticker := time.NewTicker(60 * time.Second) // Check every minute - defer ticker.Stop() - - for { - select { - case <-ctx.Done(): - return - case <-ticker.C: - // Check for split-brain state - if r.isInSplitBrainState() { - r.logger.Warn("Split-brain detected during health check, initiating automatic recovery") - - // Attempt automatic recovery - if err := r.recoverFromSplitBrain(ctx); err != nil { - r.logger.Error("Automatic split-brain recovery failed", - zap.Error(err), - zap.String("action", "will retry on next health check")) - } else { - r.logger.Info("Successfully recovered from split-brain") - } - } - } - } -} - -// recoverFromSplitBrain automatically recovers from split-brain state -func (r *RQLiteManager) recoverFromSplitBrain(ctx context.Context) error { - if r.discoveryService == nil { - return fmt.Errorf("discovery service not available for recovery") - } - - r.logger.Info("Starting automatic split-brain recovery") - - // Step 1: Ensure we have latest peer information - r.discoveryService.TriggerPeerExchange(ctx) - time.Sleep(2 * time.Second) - r.discoveryService.TriggerSync() - time.Sleep(2 * time.Second) - - // Step 2: Get data directory - rqliteDataDir, err := r.rqliteDataDirPath() - if err != nil { - return fmt.Errorf("failed to get data directory: %w", err) - } - - // Step 3: Check if peers have more recent data - allPeers := r.discoveryService.GetAllPeers() - maxPeerIndex := uint64(0) - for _, peer := range allPeers { - if peer.NodeID == r.discoverConfig.RaftAdvAddress { - continue // Skip self - } - if peer.RaftLogIndex > maxPeerIndex { - maxPeerIndex = peer.RaftLogIndex - } - } - - // Step 4: Only clear Raft state if this is a completely new node - // CRITICAL: Do NOT clear state for nodes that have existing data - // Raft will handle catch-up automatically via log replication or snapshot installation - ourIndex := r.getRaftLogIndex() - - // Only clear state for truly new nodes (log index 0) joining an existing cluster - // This is the only safe automatic recovery - all other cases should let Raft handle it - isNewNode := ourIndex == 0 && maxPeerIndex > 0 - - if !isNewNode { - r.logger.Info("Split-brain recovery: node has existing data, letting Raft handle catch-up", - zap.Uint64("our_index", ourIndex), - zap.Uint64("peer_max_index", maxPeerIndex), - zap.String("action", "skipping state clear - Raft will sync automatically")) - return nil - } - - r.logger.Info("Split-brain recovery: new node joining cluster - clearing state", - zap.Uint64("our_index", ourIndex), - zap.Uint64("peer_max_index", maxPeerIndex)) - - if err := r.clearRaftState(rqliteDataDir); err != nil { - return fmt.Errorf("failed to clear Raft state: %w", err) - } - - // Step 5: Refresh peer metadata and force write peers.json - // We trigger peer exchange again to ensure we have the absolute latest metadata - // after clearing state, then force write peers.json regardless of changes - r.logger.Info("Refreshing peer metadata after clearing raft state") - r.discoveryService.TriggerPeerExchange(ctx) - time.Sleep(1 * time.Second) // Brief wait for peer exchange to complete - - r.logger.Info("Force writing peers.json with all discovered peers") - // We use ForceWritePeersJSON instead of TriggerSync because TriggerSync - // only 
writes if membership changed, but after clearing state we need - // to write regardless of changes - if err := r.discoveryService.ForceWritePeersJSON(); err != nil { - return fmt.Errorf("failed to force write peers.json: %w", err) - } - - // Verify peers.json was created - peersPath := filepath.Join(rqliteDataDir, "raft", "peers.json") - if _, err := os.Stat(peersPath); err != nil { - return fmt.Errorf("peers.json not created after force write: %w", err) - } - - r.logger.Info("peers.json verified after force write", - zap.String("peers_path", peersPath)) - - // Step 6: Restart RQLite to pick up new peers.json - r.logger.Info("Restarting RQLite to apply new cluster configuration") - if err := r.recoverCluster(ctx, peersPath); err != nil { - return fmt.Errorf("failed to restart RQLite: %w", err) - } - - // Step 7: Wait for cluster to form (waitForReadyAndConnect already handled readiness) - r.logger.Info("Waiting for cluster to stabilize after recovery...") - time.Sleep(5 * time.Second) - - // Verify recovery succeeded - if r.isInSplitBrainState() { - return fmt.Errorf("still in split-brain after recovery attempt") - } - - r.logger.Info("Split-brain recovery completed successfully") - return nil -} - -// isSafeToClearState verifies we can safely clear Raft state -// Returns true only if peers have higher log indexes (they have more recent data) -// or if we have no meaningful state (index == 0) -func (r *RQLiteManager) isSafeToClearState(rqliteDataDir string) bool { - if r.discoveryService == nil { - r.logger.Debug("No discovery service available, cannot verify safety") - return false // No discovery service, can't verify - } - - ourIndex := r.getRaftLogIndex() - peers := r.discoveryService.GetActivePeers() - - if len(peers) == 0 { - r.logger.Debug("No peers discovered, might be network issue") - return false // No peers, might be network issue - } - - // Find max peer log index - maxPeerIndex := uint64(0) - for _, peer := range peers { - if peer.RaftLogIndex > maxPeerIndex { - maxPeerIndex = peer.RaftLogIndex - } - } - - // Safe to clear if peers have higher log indexes (they have more recent data) - // OR if we have no meaningful state (index == 0) - safe := maxPeerIndex > ourIndex || ourIndex == 0 - - r.logger.Debug("Checking if safe to clear Raft state", - zap.Uint64("our_log_index", ourIndex), - zap.Uint64("peer_max_log_index", maxPeerIndex), - zap.Bool("safe_to_clear", safe)) - - return safe -} - -// performPreStartClusterDiscovery waits for peer discovery and builds a complete peers.json -// before starting RQLite. This ensures all nodes use the same cluster membership for recovery. 
-func (r *RQLiteManager) performPreStartClusterDiscovery(ctx context.Context, rqliteDataDir string) error { - if r.discoveryService == nil { - r.logger.Warn("No discovery service available, cannot perform pre-start cluster discovery") - return fmt.Errorf("discovery service not available") - } - - r.logger.Info("Waiting for peer discovery to find other cluster members...") - - // CRITICAL: First, actively trigger peer exchange to populate peerstore with RQLite metadata - // The peerstore needs RQLite metadata from other nodes BEFORE we can collect it - r.logger.Info("Triggering peer exchange to collect RQLite metadata from connected peers") - if err := r.discoveryService.TriggerPeerExchange(ctx); err != nil { - r.logger.Warn("Peer exchange failed, continuing anyway", zap.Error(err)) - } - - // Give peer exchange a moment to complete - time.Sleep(1 * time.Second) - - // Now trigger cluster membership sync to populate knownPeers map from the peerstore - r.logger.Info("Triggering initial cluster membership sync to populate peer list") - r.discoveryService.TriggerSync() - - // Give the sync a moment to complete - time.Sleep(2 * time.Second) - - // Wait for peer discovery - give it time to find peers (30 seconds should be enough) - discoveryDeadline := time.Now().Add(30 * time.Second) - var discoveredPeers int - - for time.Now().Before(discoveryDeadline) { - // Check how many peers with RQLite metadata we've discovered - allPeers := r.discoveryService.GetAllPeers() - discoveredPeers = len(allPeers) - - r.logger.Info("Peer discovery progress", - zap.Int("discovered_peers", discoveredPeers), - zap.Duration("time_remaining", time.Until(discoveryDeadline))) - - // If we have at least our minimum cluster size, proceed - if discoveredPeers >= r.config.MinClusterSize { - r.logger.Info("Found minimum cluster size peers, proceeding with recovery", - zap.Int("discovered_peers", discoveredPeers), - zap.Int("min_cluster_size", r.config.MinClusterSize)) - break - } - - // Wait a bit before checking again - time.Sleep(2 * time.Second) - } - - // CRITICAL FIX: Skip recovery if no peers were discovered (other than ourselves) - // Only ourselves in the cluster means this is a fresh cluster, not a recovery scenario - if discoveredPeers <= 1 { - r.logger.Info("No peers discovered during pre-start discovery window - skipping recovery (fresh cluster)", - zap.Int("discovered_peers", discoveredPeers)) - return nil - } - - // AUTOMATIC RECOVERY: Check if we have stale Raft state that conflicts with cluster - // Only clear state if we are a NEW node joining an EXISTING cluster with higher log indexes - // CRITICAL FIX: Do NOT clear state if our log index is the same or similar to peers - // This prevents data loss during normal cluster restarts - allPeers := r.discoveryService.GetAllPeers() - hasExistingState := r.hasExistingRaftState(rqliteDataDir) - - if hasExistingState { - // Get our own log index from persisted snapshots - ourLogIndex := r.getRaftLogIndex() - - // Find the highest log index among other peers (excluding ourselves) - maxPeerIndex := uint64(0) - for _, peer := range allPeers { - // Skip ourselves (compare by raft address) - if peer.NodeID == r.discoverConfig.RaftAdvAddress { - continue - } - if peer.RaftLogIndex > maxPeerIndex { - maxPeerIndex = peer.RaftLogIndex - } - } - - r.logger.Info("Comparing local state with cluster state", - zap.Uint64("our_log_index", ourLogIndex), - zap.Uint64("peer_max_log_index", maxPeerIndex), - zap.String("data_dir", rqliteDataDir)) - - // CRITICAL FIX: Only clear state 
if this is a COMPLETELY NEW node joining an existing cluster - // - New node: our log index is 0, but peers have data (log index > 0) - // - For all other cases: let Raft handle catch-up via log replication or snapshot installation - // - // WHY THIS IS SAFE: - // - Raft protocol automatically catches up nodes that are behind via AppendEntries - // - If a node is too far behind, the leader will send a snapshot - // - We should NEVER clear state for nodes that have existing data, even if they're behind - // - This prevents data loss during cluster restarts and rolling upgrades - isNewNodeJoiningCluster := ourLogIndex == 0 && maxPeerIndex > 0 - - if isNewNodeJoiningCluster { - r.logger.Warn("New node joining existing cluster - clearing local state to allow clean join", - zap.Uint64("our_log_index", ourLogIndex), - zap.Uint64("peer_max_log_index", maxPeerIndex), - zap.String("data_dir", rqliteDataDir)) - - if err := r.clearRaftState(rqliteDataDir); err != nil { - r.logger.Error("Failed to clear Raft state", zap.Error(err)) - } else { - // Force write peers.json after clearing state - if r.discoveryService != nil { - r.logger.Info("Force writing peers.json after clearing local state") - if err := r.discoveryService.ForceWritePeersJSON(); err != nil { - r.logger.Error("Failed to force write peers.json after clearing state", zap.Error(err)) - } - } - } - } else { - r.logger.Info("Preserving Raft state - node will catch up via Raft protocol", - zap.Uint64("our_log_index", ourLogIndex), - zap.Uint64("peer_max_log_index", maxPeerIndex)) - } - } - - // Trigger final sync to ensure peers.json is up to date with latest discovered peers - r.logger.Info("Triggering final cluster membership sync to build complete peers.json") - r.discoveryService.TriggerSync() - - // Wait a moment for the sync to complete - time.Sleep(2 * time.Second) - - // Verify peers.json was created - peersPath := filepath.Join(rqliteDataDir, "raft", "peers.json") - if _, err := os.Stat(peersPath); err != nil { - return fmt.Errorf("peers.json was not created after discovery: %w", err) - } - - r.logger.Info("Pre-start cluster discovery completed successfully", - zap.String("peers_file", peersPath), - zap.Int("peer_count", discoveredPeers)) - - return nil -} - -// validateNodeID checks that rqlite's reported node ID matches our configured raft address -func (r *RQLiteManager) validateNodeID() error { - // Query /nodes endpoint to get our node ID - // Retry a few times as the endpoint might not be ready immediately - for i := 0; i < 5; i++ { - nodes, err := r.getRQLiteNodes() - if err != nil { - // If endpoint is not ready yet, wait and retry - if i < 4 { - time.Sleep(500 * time.Millisecond) - continue - } - // Log at debug level if validation fails - not critical - r.logger.Debug("Node ID validation skipped (endpoint unavailable)", zap.Error(err)) - return nil - } - - expectedID := r.discoverConfig.RaftAdvAddress - if expectedID == "" { - return fmt.Errorf("raft_adv_address not configured") - } - - // If cluster is still forming, nodes list might be empty - that's okay - if len(nodes) == 0 { - r.logger.Debug("Node ID validation skipped (cluster not yet formed)") - return nil - } - - // Find our node in the cluster (match by address) - for _, node := range nodes { - if node.Address == expectedID { - if node.ID != expectedID { - r.logger.Error("CRITICAL: RQLite node ID mismatch", - zap.String("configured_raft_address", expectedID), - zap.String("rqlite_node_id", node.ID), - zap.String("rqlite_node_address", node.Address), - 
zap.String("explanation", "peers.json id field must match rqlite's node ID (raft address)")) - return fmt.Errorf("node ID mismatch: configured %s but rqlite reports %s", expectedID, node.ID) - } - r.logger.Debug("Node ID validation passed", - zap.String("node_id", node.ID), - zap.String("address", node.Address)) - return nil - } - } - - // If we can't find ourselves but other nodes exist, cluster might still be forming - // This is fine - don't log a warning - r.logger.Debug("Node ID validation skipped (node not yet in cluster membership)", - zap.String("expected_address", expectedID), - zap.Int("nodes_in_cluster", len(nodes))) - return nil - } - - return nil -} diff --git a/pkg/rqlite/util.go b/pkg/rqlite/util.go new file mode 100644 index 0000000..01360cc --- /dev/null +++ b/pkg/rqlite/util.go @@ -0,0 +1,58 @@ +package rqlite + +import ( + "os" + "path/filepath" + "strings" + "time" +) + +func (r *RQLiteManager) rqliteDataDirPath() (string, error) { + dataDir := os.ExpandEnv(r.dataDir) + if strings.HasPrefix(dataDir, "~") { + home, _ := os.UserHomeDir() + dataDir = filepath.Join(home, dataDir[1:]) + } + return filepath.Join(dataDir, "rqlite"), nil +} + +func (r *RQLiteManager) resolveMigrationsDir() (string, error) { + productionPath := "/home/debros/src/migrations" + if _, err := os.Stat(productionPath); err == nil { + return productionPath, nil + } + return "migrations", nil +} + +func (r *RQLiteManager) prepareDataDir() (string, error) { + rqliteDataDir, err := r.rqliteDataDirPath() + if err != nil { + return "", err + } + if err := os.MkdirAll(rqliteDataDir, 0755); err != nil { + return "", err + } + return rqliteDataDir, nil +} + +func (r *RQLiteManager) hasExistingState(rqliteDataDir string) bool { + entries, err := os.ReadDir(rqliteDataDir) + if err != nil { + return false + } + for _, e := range entries { + if e.Name() != "." && e.Name() != ".." { + return true + } + } + return false +} + +func (r *RQLiteManager) exponentialBackoff(attempt int, baseDelay time.Duration, maxDelay time.Duration) time.Duration { + delay := baseDelay * time.Duration(1< maxDelay { + delay = maxDelay + } + return delay +} +