mirror of
https://github.com/DeBrosOfficial/network.git
synced 2026-01-30 21:33:02 +00:00
272 lines
6.6 KiB
Go
272 lines
6.6 KiB
Go
package health
|
|
|
|
import (
|
|
"context"
|
|
"fmt"
|
|
"net/http"
|
|
"sync"
|
|
"time"
|
|
|
|
"github.com/DeBrosOfficial/network/pkg/database"
|
|
"go.uber.org/zap"
|
|
)
|
|
|
|
// deploymentRow represents a deployment record for health checking
|
|
type deploymentRow struct {
|
|
ID string `db:"id"`
|
|
Namespace string `db:"namespace"`
|
|
Name string `db:"name"`
|
|
Type string `db:"type"`
|
|
Port int `db:"port"`
|
|
HealthCheckPath string `db:"health_check_path"`
|
|
HomeNodeID string `db:"home_node_id"`
|
|
}
|
|
|
|
// HealthChecker monitors deployment health
|
|
type HealthChecker struct {
|
|
db database.Database
|
|
logger *zap.Logger
|
|
workers int
|
|
mu sync.RWMutex
|
|
active map[string]bool // deployment_id -> is_active
|
|
}
|
|
|
|
// NewHealthChecker creates a new health checker
|
|
func NewHealthChecker(db database.Database, logger *zap.Logger) *HealthChecker {
|
|
return &HealthChecker{
|
|
db: db,
|
|
logger: logger,
|
|
workers: 10,
|
|
active: make(map[string]bool),
|
|
}
|
|
}
|
|
|
|
// Start begins health monitoring
|
|
func (hc *HealthChecker) Start(ctx context.Context) error {
|
|
hc.logger.Info("Starting health checker", zap.Int("workers", hc.workers))
|
|
|
|
ticker := time.NewTicker(30 * time.Second)
|
|
defer ticker.Stop()
|
|
|
|
for {
|
|
select {
|
|
case <-ctx.Done():
|
|
hc.logger.Info("Health checker stopped")
|
|
return ctx.Err()
|
|
case <-ticker.C:
|
|
if err := hc.checkAllDeployments(ctx); err != nil {
|
|
hc.logger.Error("Health check cycle failed", zap.Error(err))
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
// checkAllDeployments checks all active deployments
|
|
func (hc *HealthChecker) checkAllDeployments(ctx context.Context) error {
|
|
var rows []deploymentRow
|
|
query := `
|
|
SELECT id, namespace, name, type, port, health_check_path, home_node_id
|
|
FROM deployments
|
|
WHERE status = 'active' AND type IN ('nextjs', 'nodejs-backend', 'go-backend')
|
|
`
|
|
|
|
err := hc.db.Query(ctx, &rows, query)
|
|
if err != nil {
|
|
return fmt.Errorf("failed to query deployments: %w", err)
|
|
}
|
|
|
|
hc.logger.Info("Checking deployments", zap.Int("count", len(rows)))
|
|
|
|
// Process in parallel
|
|
sem := make(chan struct{}, hc.workers)
|
|
var wg sync.WaitGroup
|
|
|
|
for _, row := range rows {
|
|
wg.Add(1)
|
|
go func(r deploymentRow) {
|
|
defer wg.Done()
|
|
sem <- struct{}{}
|
|
defer func() { <-sem }()
|
|
|
|
healthy := hc.checkDeployment(ctx, r)
|
|
hc.recordHealthCheck(ctx, r.ID, healthy)
|
|
}(row)
|
|
}
|
|
|
|
wg.Wait()
|
|
return nil
|
|
}
|
|
|
|
// checkDeployment checks a single deployment
|
|
func (hc *HealthChecker) checkDeployment(ctx context.Context, dep deploymentRow) bool {
|
|
if dep.Port == 0 {
|
|
// Static deployments are always healthy
|
|
return true
|
|
}
|
|
|
|
// Check local port
|
|
url := fmt.Sprintf("http://localhost:%d%s", dep.Port, dep.HealthCheckPath)
|
|
|
|
checkCtx, cancel := context.WithTimeout(ctx, 5*time.Second)
|
|
defer cancel()
|
|
|
|
req, err := http.NewRequestWithContext(checkCtx, "GET", url, nil)
|
|
if err != nil {
|
|
hc.logger.Error("Failed to create health check request",
|
|
zap.String("deployment", dep.Name),
|
|
zap.Error(err),
|
|
)
|
|
return false
|
|
}
|
|
|
|
client := &http.Client{Timeout: 5 * time.Second}
|
|
resp, err := client.Do(req)
|
|
if err != nil {
|
|
hc.logger.Warn("Health check failed",
|
|
zap.String("deployment", dep.Name),
|
|
zap.String("namespace", dep.Namespace),
|
|
zap.String("url", url),
|
|
zap.Error(err),
|
|
)
|
|
return false
|
|
}
|
|
defer resp.Body.Close()
|
|
|
|
healthy := resp.StatusCode >= 200 && resp.StatusCode < 300
|
|
|
|
if !healthy {
|
|
hc.logger.Warn("Health check returned unhealthy status",
|
|
zap.String("deployment", dep.Name),
|
|
zap.Int("status", resp.StatusCode),
|
|
)
|
|
}
|
|
|
|
return healthy
|
|
}
|
|
|
|
// recordHealthCheck records the health check result
|
|
func (hc *HealthChecker) recordHealthCheck(ctx context.Context, deploymentID string, healthy bool) {
|
|
status := "healthy"
|
|
if !healthy {
|
|
status = "unhealthy"
|
|
}
|
|
|
|
query := `
|
|
INSERT INTO deployment_health_checks (deployment_id, status, checked_at, response_time_ms)
|
|
VALUES (?, ?, ?, ?)
|
|
`
|
|
|
|
_, err := hc.db.Exec(ctx, query, deploymentID, status, time.Now(), 0)
|
|
if err != nil {
|
|
hc.logger.Error("Failed to record health check",
|
|
zap.String("deployment", deploymentID),
|
|
zap.Error(err),
|
|
)
|
|
}
|
|
|
|
// Track consecutive failures
|
|
hc.checkConsecutiveFailures(ctx, deploymentID, healthy)
|
|
}
|
|
|
|
// checkConsecutiveFailures marks deployment as failed after 3 consecutive failures
|
|
func (hc *HealthChecker) checkConsecutiveFailures(ctx context.Context, deploymentID string, currentHealthy bool) {
|
|
if currentHealthy {
|
|
return
|
|
}
|
|
|
|
type healthRow struct {
|
|
Status string `db:"status"`
|
|
}
|
|
|
|
var rows []healthRow
|
|
query := `
|
|
SELECT status
|
|
FROM deployment_health_checks
|
|
WHERE deployment_id = ?
|
|
ORDER BY checked_at DESC
|
|
LIMIT 3
|
|
`
|
|
|
|
err := hc.db.Query(ctx, &rows, query, deploymentID)
|
|
if err != nil {
|
|
hc.logger.Error("Failed to query health history", zap.Error(err))
|
|
return
|
|
}
|
|
|
|
// Check if last 3 checks all failed
|
|
if len(rows) >= 3 {
|
|
allFailed := true
|
|
for _, row := range rows {
|
|
if row.Status != "unhealthy" {
|
|
allFailed = false
|
|
break
|
|
}
|
|
}
|
|
|
|
if allFailed {
|
|
hc.logger.Error("Deployment has 3 consecutive failures, marking as failed",
|
|
zap.String("deployment", deploymentID),
|
|
)
|
|
|
|
updateQuery := `
|
|
UPDATE deployments
|
|
SET status = 'failed', updated_at = ?
|
|
WHERE id = ?
|
|
`
|
|
|
|
_, err := hc.db.Exec(ctx, updateQuery, time.Now(), deploymentID)
|
|
if err != nil {
|
|
hc.logger.Error("Failed to mark deployment as failed", zap.Error(err))
|
|
}
|
|
|
|
// Record event
|
|
eventQuery := `
|
|
INSERT INTO deployment_events (deployment_id, event_type, message, created_at)
|
|
VALUES (?, 'health_failed', 'Deployment marked as failed after 3 consecutive health check failures', ?)
|
|
`
|
|
hc.db.Exec(ctx, eventQuery, deploymentID, time.Now())
|
|
}
|
|
}
|
|
}
|
|
|
|
// GetHealthStatus gets recent health checks for a deployment
|
|
func (hc *HealthChecker) GetHealthStatus(ctx context.Context, deploymentID string, limit int) ([]HealthCheck, error) {
|
|
type healthRow struct {
|
|
Status string `db:"status"`
|
|
CheckedAt time.Time `db:"checked_at"`
|
|
ResponseTimeMs int `db:"response_time_ms"`
|
|
}
|
|
|
|
var rows []healthRow
|
|
query := `
|
|
SELECT status, checked_at, response_time_ms
|
|
FROM deployment_health_checks
|
|
WHERE deployment_id = ?
|
|
ORDER BY checked_at DESC
|
|
LIMIT ?
|
|
`
|
|
|
|
err := hc.db.Query(ctx, &rows, query, deploymentID, limit)
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
|
|
checks := make([]HealthCheck, len(rows))
|
|
for i, row := range rows {
|
|
checks[i] = HealthCheck{
|
|
Status: row.Status,
|
|
CheckedAt: row.CheckedAt,
|
|
ResponseTimeMs: row.ResponseTimeMs,
|
|
}
|
|
}
|
|
|
|
return checks, nil
|
|
}
|
|
|
|
// HealthCheck represents a health check result
|
|
type HealthCheck struct {
|
|
Status string `json:"status"`
|
|
CheckedAt time.Time `json:"checked_at"`
|
|
ResponseTimeMs int `json:"response_time_ms"`
|
|
}
|