- Consolidated development commands into a new `dev` command group for better organization.
- Introduced a `prod` command group to manage production environment operations.
- Updated Makefile to simplify the development environment setup and improve logging.
- Enhanced README to clarify the development process and health check requirements.
- Removed deprecated configuration and service management commands to streamline the CLI interface.
218 lines
6.4 KiB
Go
package development

import (
	"context"
	"encoding/json"
	"fmt"
	"io"
	"net/http"
	"os/exec"
	"strings"
	"time"
)

// HealthCheckResult represents the result of a health check
type HealthCheckResult struct {
	Name    string
	Healthy bool
	Details string
}

// IPFSHealthCheck verifies IPFS peer connectivity
func (pm *ProcessManager) IPFSHealthCheck(ctx context.Context, nodes []ipfsNodeInfo) HealthCheckResult {
	result := HealthCheckResult{Name: "IPFS Peers"}

	healthyCount := 0
	for _, node := range nodes {
		cmd := exec.CommandContext(ctx, "ipfs", "swarm", "peers", "--repo-dir="+node.ipfsPath)
		output, err := cmd.CombinedOutput()
		if err != nil {
			result.Details += fmt.Sprintf("%s: error getting peers (%v); ", node.name, err)
			continue
		}

		// Split by newlines and filter empty lines
		peerLines := strings.Split(strings.TrimSpace(string(output)), "\n")
		peerCount := 0
		for _, line := range peerLines {
			if strings.TrimSpace(line) != "" {
				peerCount++
			}
		}

		if peerCount < 2 {
			result.Details += fmt.Sprintf("%s: only %d peers (want 2+); ", node.name, peerCount)
		} else {
			result.Details += fmt.Sprintf("%s: %d peers; ", node.name, peerCount)
			healthyCount++
		}
	}

	result.Healthy = healthyCount == len(nodes)
	return result
}

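// Note: the line-count parsing above assumes `ipfs swarm peers` prints one
// multiaddr per line. Illustrative example lines (not captured output, peer
// IDs truncated):
//
//	/ip4/127.0.0.1/tcp/4002/p2p/12D3KooW...
//	/ip4/127.0.0.1/tcp/4003/p2p/12D3KooW...
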
// RQLiteHealthCheck verifies RQLite cluster formation
func (pm *ProcessManager) RQLiteHealthCheck(ctx context.Context) HealthCheckResult {
	result := HealthCheckResult{Name: "RQLite Cluster"}

	// Check bootstrap node
	bootstrapStatus := pm.checkRQLiteNode(ctx, "bootstrap", 5001)
	if !bootstrapStatus.Healthy {
		result.Details += fmt.Sprintf("bootstrap: %s; ", bootstrapStatus.Details)
		return result
	}

	// Check node2 and node3
	node2Status := pm.checkRQLiteNode(ctx, "node2", 5002)
	node3Status := pm.checkRQLiteNode(ctx, "node3", 5003)

	if node2Status.Healthy && node3Status.Healthy {
		result.Healthy = true
		result.Details = fmt.Sprintf("bootstrap: leader ok; node2: %s; node3: %s", node2Status.Details, node3Status.Details)
	} else {
		result.Details = fmt.Sprintf("bootstrap: ok; node2: %s; node3: %s", node2Status.Details, node3Status.Details)
	}

	return result
}

// checkRQLiteNode queries a single RQLite node's status
func (pm *ProcessManager) checkRQLiteNode(ctx context.Context, name string, httpPort int) HealthCheckResult {
	result := HealthCheckResult{Name: fmt.Sprintf("RQLite-%s", name)}

	urlStr := fmt.Sprintf("http://localhost:%d/status", httpPort)
	client := &http.Client{Timeout: 2 * time.Second}
	resp, err := client.Get(urlStr)
	if err != nil {
		result.Details = fmt.Sprintf("connection failed: %v", err)
		return result
	}
	defer resp.Body.Close()

	if resp.StatusCode != http.StatusOK {
		result.Details = fmt.Sprintf("HTTP %d", resp.StatusCode)
		return result
	}

	var status map[string]interface{}
	if err := json.NewDecoder(resp.Body).Decode(&status); err != nil {
		result.Details = fmt.Sprintf("decode error: %v", err)
		return result
	}

	// Check the store.raft structure (RQLite 8 format)
	store, ok := status["store"].(map[string]interface{})
	if !ok {
		result.Details = "store data not found"
		return result
	}

	raft, ok := store["raft"].(map[string]interface{})
	if !ok {
		result.Details = "raft data not found"
		return result
	}

	// Check if we have a leader
	leader, hasLeader := raft["leader"].(string)
	if hasLeader && leader != "" {
		result.Healthy = true
		result.Details = "cluster member with leader elected"
		return result
	}

	// Check node state - accept both Leader and Follower
	if state, ok := raft["state"].(string); ok {
		if state == "Leader" {
			result.Healthy = true
			result.Details = "this node is leader"
			return result
		}
		if state == "Follower" {
			result.Healthy = true
			result.Details = "this node is follower in cluster"
			return result
		}
		result.Details = fmt.Sprintf("state: %s", state)
		return result
	}

	result.Details = "not yet connected"
	return result
}

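// For reference, an abridged and purely illustrative /status response showing
// only the fields checkRQLiteNode inspects (real RQLite responses carry many
// more fields; the address value below is made up):
//
//	{
//	  "store": {
//	    "raft": {
//	      "leader": "127.0.0.1:7001",
//	      "state": "Follower"
//	    }
//	  }
//	}
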
// LibP2PHealthCheck verifies that network nodes have peer connections
func (pm *ProcessManager) LibP2PHealthCheck(ctx context.Context) HealthCheckResult {
	result := HealthCheckResult{Name: "LibP2P/Node Peers"}

	// Check that at least 2 nodes are part of the RQLite cluster (implies peer connectivity)
	// and that they can communicate via LibP2P (which they use for cluster discovery)
	healthyNodes := 0
	for i, name := range []string{"bootstrap", "node2", "node3"} {
		httpPort := 5001 + i
		status := pm.checkRQLiteNode(ctx, name, httpPort)
		if status.Healthy {
			healthyNodes++
			result.Details += fmt.Sprintf("%s: connected; ", name)
		} else {
			result.Details += fmt.Sprintf("%s: %s; ", name, status.Details)
		}
	}

	// Healthy if at least 2 nodes report connectivity (including bootstrap)
	result.Healthy = healthyNodes >= 2
	return result
}

// HealthCheckWithRetry performs a health check with retry logic
func (pm *ProcessManager) HealthCheckWithRetry(ctx context.Context, nodes []ipfsNodeInfo, retries int, retryInterval time.Duration, timeout time.Duration) bool {
	fmt.Fprintf(pm.logWriter, "\n⚕️ Validating cluster health...\n")

	deadlineCtx, cancel := context.WithTimeout(ctx, timeout)
	defer cancel()

	for attempt := 1; attempt <= retries; attempt++ {
		// Perform all checks
		ipfsResult := pm.IPFSHealthCheck(deadlineCtx, nodes)
		rqliteResult := pm.RQLiteHealthCheck(deadlineCtx)
		libp2pResult := pm.LibP2PHealthCheck(deadlineCtx)

		// Log results
		if attempt == 1 || attempt == retries || (attempt%3 == 0) {
			fmt.Fprintf(pm.logWriter, " Attempt %d/%d:\n", attempt, retries)
			pm.logHealthCheckResult(pm.logWriter, " ", ipfsResult)
			pm.logHealthCheckResult(pm.logWriter, " ", rqliteResult)
			pm.logHealthCheckResult(pm.logWriter, " ", libp2pResult)
		}

		// All checks must pass
		if ipfsResult.Healthy && rqliteResult.Healthy && libp2pResult.Healthy {
			fmt.Fprintf(pm.logWriter, "\n✓ All health checks passed!\n")
			return true
		}

		if attempt < retries {
			select {
			case <-time.After(retryInterval):
				continue
			case <-deadlineCtx.Done():
				fmt.Fprintf(pm.logWriter, "\n❌ Health check timeout reached\n")
				return false
			}
		}
	}

	fmt.Fprintf(pm.logWriter, "\n❌ Health checks failed after %d attempts\n", retries)
	return false
}

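// A minimal usage sketch. The retry count, interval, timeout, and surrounding
// caller are assumptions for illustration, not part of this file:
//
//	if !pm.HealthCheckWithRetry(ctx, nodes, 10, 5*time.Second, 2*time.Minute) {
//		return fmt.Errorf("development cluster failed health checks")
//	}
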
// logHealthCheckResult logs a single health check result
func (pm *ProcessManager) logHealthCheckResult(w io.Writer, indent string, result HealthCheckResult) {
	status := "❌"
	if result.Healthy {
		status = "✓"
	}
	fmt.Fprintf(w, "%s%s %s: %s\n", indent, status, result.Name, result.Details)
}
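
// Illustrative output of logHealthCheckResult (indent and peer counts are
// made-up examples; Details text varies by check):
//
//	 ✓ IPFS Peers: bootstrap: 4 peers; node2: 4 peers; node3: 4 peers;
//	 ❌ RQLite Cluster: bootstrap: connection failed: ...;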