anonpenguin23 0388c3a766
refactor: streamline development and production command structure
- Consolidated development commands into a new `dev` command group for better organization.
- Introduced a `prod` command group to manage production environment operations.
- Updated Makefile to simplify the development environment setup and improve logging.
- Enhanced README to clarify the development process and health check requirements.
- Removed deprecated configuration and service management commands to streamline the CLI interface.
2025-11-09 18:28:24 +02:00

218 lines
6.4 KiB
Go
Raw Blame History

This file contains invisible Unicode characters

This file contains invisible Unicode characters that are indistinguishable to humans but may be processed differently by a computer. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

package development
import (
"context"
"encoding/json"
"fmt"
"io"
"net/http"
"os/exec"
"strings"
"time"
)
// HealthCheckResult represents the result of a health check
type HealthCheckResult struct {
Name string
Healthy bool
Details string
}
// IPFSHealthCheck verifies IPFS peer connectivity
func (pm *ProcessManager) IPFSHealthCheck(ctx context.Context, nodes []ipfsNodeInfo) HealthCheckResult {
result := HealthCheckResult{Name: "IPFS Peers"}
healthyCount := 0
for _, node := range nodes {
cmd := exec.CommandContext(ctx, "ipfs", "swarm", "peers", "--repo-dir="+node.ipfsPath)
output, err := cmd.CombinedOutput()
if err != nil {
result.Details += fmt.Sprintf("%s: error getting peers (%v); ", node.name, err)
continue
}
// Split by newlines and filter empty lines
peerLines := strings.Split(strings.TrimSpace(string(output)), "\n")
peerCount := 0
for _, line := range peerLines {
if strings.TrimSpace(line) != "" {
peerCount++
}
}
if peerCount < 2 {
result.Details += fmt.Sprintf("%s: only %d peers (want 2+); ", node.name, peerCount)
} else {
result.Details += fmt.Sprintf("%s: %d peers; ", node.name, peerCount)
healthyCount++
}
}
result.Healthy = healthyCount == len(nodes)
return result
}
// RQLiteHealthCheck verifies RQLite cluster formation
func (pm *ProcessManager) RQLiteHealthCheck(ctx context.Context) HealthCheckResult {
result := HealthCheckResult{Name: "RQLite Cluster"}
// Check bootstrap node
bootstrapStatus := pm.checkRQLiteNode(ctx, "bootstrap", 5001)
if !bootstrapStatus.Healthy {
result.Details += fmt.Sprintf("bootstrap: %s; ", bootstrapStatus.Details)
return result
}
// Check node2 and node3
node2Status := pm.checkRQLiteNode(ctx, "node2", 5002)
node3Status := pm.checkRQLiteNode(ctx, "node3", 5003)
if node2Status.Healthy && node3Status.Healthy {
result.Healthy = true
result.Details = fmt.Sprintf("bootstrap: leader ok; node2: %s; node3: %s", node2Status.Details, node3Status.Details)
} else {
result.Details = fmt.Sprintf("bootstrap: ok; node2: %s; node3: %s", node2Status.Details, node3Status.Details)
}
return result
}
// checkRQLiteNode queries a single RQLite node's status
func (pm *ProcessManager) checkRQLiteNode(ctx context.Context, name string, httpPort int) HealthCheckResult {
result := HealthCheckResult{Name: fmt.Sprintf("RQLite-%s", name)}
urlStr := fmt.Sprintf("http://localhost:%d/status", httpPort)
client := &http.Client{Timeout: 2 * time.Second}
resp, err := client.Get(urlStr)
if err != nil {
result.Details = fmt.Sprintf("connection failed: %v", err)
return result
}
defer resp.Body.Close()
if resp.StatusCode != http.StatusOK {
result.Details = fmt.Sprintf("HTTP %d", resp.StatusCode)
return result
}
var status map[string]interface{}
if err := json.NewDecoder(resp.Body).Decode(&status); err != nil {
result.Details = fmt.Sprintf("decode error: %v", err)
return result
}
// Check the store.raft structure (RQLite 8 format)
store, ok := status["store"].(map[string]interface{})
if !ok {
result.Details = "store data not found"
return result
}
raft, ok := store["raft"].(map[string]interface{})
if !ok {
result.Details = "raft data not found"
return result
}
// Check if we have a leader
leader, hasLeader := raft["leader"].(string)
if hasLeader && leader != "" {
result.Healthy = true
result.Details = "cluster member with leader elected"
return result
}
// Check node state - accept both Leader and Follower
if state, ok := raft["state"].(string); ok {
if state == "Leader" {
result.Healthy = true
result.Details = "this node is leader"
return result
}
if state == "Follower" {
result.Healthy = true
result.Details = "this node is follower in cluster"
return result
}
result.Details = fmt.Sprintf("state: %s", state)
return result
}
result.Details = "not yet connected"
return result
}
// LibP2PHealthCheck verifies that network nodes have peer connections
func (pm *ProcessManager) LibP2PHealthCheck(ctx context.Context) HealthCheckResult {
result := HealthCheckResult{Name: "LibP2P/Node Peers"}
// Check that at least 2 nodes are part of the RQLite cluster (implies peer connectivity)
// and that they can communicate via LibP2P (which they use for cluster discovery)
healthyNodes := 0
for i, name := range []string{"bootstrap", "node2", "node3"} {
httpPort := 5001 + i
status := pm.checkRQLiteNode(ctx, name, httpPort)
if status.Healthy {
healthyNodes++
result.Details += fmt.Sprintf("%s: connected; ", name)
} else {
result.Details += fmt.Sprintf("%s: %s; ", name, status.Details)
}
}
// Healthy if at least 2 nodes report connectivity (including bootstrap)
result.Healthy = healthyNodes >= 2
return result
}
// HealthCheckWithRetry performs a health check with retry logic
func (pm *ProcessManager) HealthCheckWithRetry(ctx context.Context, nodes []ipfsNodeInfo, retries int, retryInterval time.Duration, timeout time.Duration) bool {
fmt.Fprintf(pm.logWriter, "\n⚕ Validating cluster health...\n")
deadlineCtx, cancel := context.WithTimeout(ctx, timeout)
defer cancel()
for attempt := 1; attempt <= retries; attempt++ {
// Perform all checks
ipfsResult := pm.IPFSHealthCheck(deadlineCtx, nodes)
rqliteResult := pm.RQLiteHealthCheck(deadlineCtx)
libp2pResult := pm.LibP2PHealthCheck(deadlineCtx)
// Log results
if attempt == 1 || attempt == retries || (attempt%3 == 0) {
fmt.Fprintf(pm.logWriter, " Attempt %d/%d:\n", attempt, retries)
pm.logHealthCheckResult(pm.logWriter, " ", ipfsResult)
pm.logHealthCheckResult(pm.logWriter, " ", rqliteResult)
pm.logHealthCheckResult(pm.logWriter, " ", libp2pResult)
}
// All checks must pass
if ipfsResult.Healthy && rqliteResult.Healthy && libp2pResult.Healthy {
fmt.Fprintf(pm.logWriter, "\n✓ All health checks passed!\n")
return true
}
if attempt < retries {
select {
case <-time.After(retryInterval):
continue
case <-deadlineCtx.Done():
fmt.Fprintf(pm.logWriter, "\n❌ Health check timeout reached\n")
return false
}
}
}
fmt.Fprintf(pm.logWriter, "\n❌ Health checks failed after %d attempts\n", retries)
return false
}
// logHealthCheckResult logs a single health check result
func (pm *ProcessManager) logHealthCheckResult(w io.Writer, indent string, result HealthCheckResult) {
status := "❌"
if result.Healthy {
status = "✓"
}
fmt.Fprintf(w, "%s%s %s: %s\n", indent, status, result.Name, result.Details)
}