mirror of
https://github.com/DeBrosOfficial/orama.git
synced 2026-03-17 06:23:00 +00:00
feat: update version to 0.110.0 and enhance dead node recovery handling with deployment status updates
This commit is contained in:
parent
8cabe48f7d
commit
c731486454
2
Makefile
2
Makefile
@ -63,7 +63,7 @@ test-e2e-quick:
|
|||||||
|
|
||||||
.PHONY: build clean test deps tidy fmt vet lint install-hooks redeploy-devnet redeploy-testnet release health
|
.PHONY: build clean test deps tidy fmt vet lint install-hooks redeploy-devnet redeploy-testnet release health
|
||||||
|
|
||||||
VERSION := 0.109.0
|
VERSION := 0.110.0
|
||||||
COMMIT ?= $(shell git rev-parse --short HEAD 2>/dev/null || echo unknown)
|
COMMIT ?= $(shell git rev-parse --short HEAD 2>/dev/null || echo unknown)
|
||||||
DATE ?= $(shell date -u +%Y-%m-%dT%H:%M:%SZ)
|
DATE ?= $(shell date -u +%Y-%m-%dT%H:%M:%SZ)
|
||||||
LDFLAGS := -X 'main.version=$(VERSION)' -X 'main.commit=$(COMMIT)' -X 'main.date=$(DATE)'
|
LDFLAGS := -X 'main.version=$(VERSION)' -X 'main.commit=$(COMMIT)' -X 'main.date=$(DATE)'
|
||||||
|
|||||||
@ -34,8 +34,9 @@ type survivingNodePorts struct {
|
|||||||
}
|
}
|
||||||
|
|
||||||
// HandleDeadNode processes the death of a network node by recovering all affected
|
// HandleDeadNode processes the death of a network node by recovering all affected
|
||||||
// namespace clusters. It finds all clusters with assignments to the dead node and
|
// namespace clusters and deployment replicas. It marks all deployment replicas on
|
||||||
// replaces it with a healthy node in each cluster.
|
// the dead node as failed, updates deployment statuses, and replaces namespace
|
||||||
|
// cluster nodes.
|
||||||
func (cm *ClusterManager) HandleDeadNode(ctx context.Context, deadNodeID string) {
|
func (cm *ClusterManager) HandleDeadNode(ctx context.Context, deadNodeID string) {
|
||||||
cm.logger.Error("Handling dead node — starting recovery",
|
cm.logger.Error("Handling dead node — starting recovery",
|
||||||
zap.String("dead_node", deadNodeID),
|
zap.String("dead_node", deadNodeID),
|
||||||
@ -46,6 +47,11 @@ func (cm *ClusterManager) HandleDeadNode(ctx context.Context, deadNodeID string)
|
|||||||
cm.logger.Warn("Failed to mark node offline", zap.Error(err))
|
cm.logger.Warn("Failed to mark node offline", zap.Error(err))
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Mark all deployment replicas on the dead node as failed.
|
||||||
|
// This must happen before namespace recovery so routing immediately
|
||||||
|
// excludes the dead node — no relying on circuit breakers to discover it.
|
||||||
|
cm.markDeadNodeReplicasFailed(ctx, deadNodeID)
|
||||||
|
|
||||||
// Find all affected clusters
|
// Find all affected clusters
|
||||||
clusters, err := cm.getClustersByNodeID(ctx, deadNodeID)
|
clusters, err := cm.getClustersByNodeID(ctx, deadNodeID)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
@ -1087,3 +1093,83 @@ func (cm *ClusterManager) addNodeToCluster(
|
|||||||
|
|
||||||
return replacement, portBlock, nil
|
return replacement, portBlock, nil
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// markDeadNodeReplicasFailed marks all deployment replicas on a dead node as
|
||||||
|
// 'failed' and recalculates each affected deployment's status. This ensures
|
||||||
|
// routing immediately excludes the dead node instead of discovering it's
|
||||||
|
// unreachable through timeouts.
|
||||||
|
func (cm *ClusterManager) markDeadNodeReplicasFailed(ctx context.Context, deadNodeID string) {
|
||||||
|
// Find all active deployment replicas on the dead node.
|
||||||
|
type affectedReplica struct {
|
||||||
|
DeploymentID string `db:"deployment_id"`
|
||||||
|
}
|
||||||
|
var affected []affectedReplica
|
||||||
|
findQuery := `SELECT DISTINCT deployment_id FROM deployment_replicas WHERE node_id = ? AND status = 'active'`
|
||||||
|
if err := cm.db.Query(ctx, &affected, findQuery, deadNodeID); err != nil {
|
||||||
|
cm.logger.Warn("Failed to query deployment replicas for dead node",
|
||||||
|
zap.String("dead_node", deadNodeID), zap.Error(err))
|
||||||
|
return
|
||||||
|
}
|
||||||
|
|
||||||
|
if len(affected) == 0 {
|
||||||
|
return
|
||||||
|
}
|
||||||
|
|
||||||
|
cm.logger.Info("Marking deployment replicas on dead node as failed",
|
||||||
|
zap.String("dead_node", deadNodeID),
|
||||||
|
zap.Int("replica_count", len(affected)),
|
||||||
|
)
|
||||||
|
|
||||||
|
// Mark all replicas on the dead node as failed in a single UPDATE.
|
||||||
|
markQuery := `UPDATE deployment_replicas SET status = 'failed' WHERE node_id = ? AND status = 'active'`
|
||||||
|
if _, err := cm.db.Exec(ctx, markQuery, deadNodeID); err != nil {
|
||||||
|
cm.logger.Error("Failed to mark deployment replicas as failed",
|
||||||
|
zap.String("dead_node", deadNodeID), zap.Error(err))
|
||||||
|
return
|
||||||
|
}
|
||||||
|
|
||||||
|
// Recalculate each affected deployment's status based on remaining active replicas.
|
||||||
|
type replicaCount struct {
|
||||||
|
Count int `db:"count"`
|
||||||
|
}
|
||||||
|
now := time.Now().Format("2006-01-02 15:04:05")
|
||||||
|
|
||||||
|
for _, a := range affected {
|
||||||
|
var counts []replicaCount
|
||||||
|
countQuery := `SELECT COUNT(*) as count FROM deployment_replicas WHERE deployment_id = ? AND status = 'active'`
|
||||||
|
if err := cm.db.Query(ctx, &counts, countQuery, a.DeploymentID); err != nil {
|
||||||
|
cm.logger.Warn("Failed to count active replicas for deployment",
|
||||||
|
zap.String("deployment_id", a.DeploymentID), zap.Error(err))
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
|
||||||
|
activeCount := 0
|
||||||
|
if len(counts) > 0 {
|
||||||
|
activeCount = counts[0].Count
|
||||||
|
}
|
||||||
|
|
||||||
|
if activeCount > 0 {
|
||||||
|
// Some replicas still alive — degraded, not dead.
|
||||||
|
statusQuery := `UPDATE deployments SET status = 'degraded' WHERE id = ? AND status = 'active'`
|
||||||
|
cm.db.Exec(ctx, statusQuery, a.DeploymentID)
|
||||||
|
cm.logger.Warn("Deployment degraded — replica on dead node marked failed",
|
||||||
|
zap.String("deployment_id", a.DeploymentID),
|
||||||
|
zap.String("dead_node", deadNodeID),
|
||||||
|
zap.Int("remaining_active", activeCount),
|
||||||
|
)
|
||||||
|
} else {
|
||||||
|
// No replicas alive — deployment is failed.
|
||||||
|
statusQuery := `UPDATE deployments SET status = 'failed' WHERE id = ? AND status IN ('active', 'degraded')`
|
||||||
|
cm.db.Exec(ctx, statusQuery, a.DeploymentID)
|
||||||
|
cm.logger.Error("Deployment failed — all replicas on dead node",
|
||||||
|
zap.String("deployment_id", a.DeploymentID),
|
||||||
|
zap.String("dead_node", deadNodeID),
|
||||||
|
)
|
||||||
|
}
|
||||||
|
|
||||||
|
// Log event for audit trail.
|
||||||
|
eventQuery := `INSERT INTO deployment_events (deployment_id, event_type, message, created_at) VALUES (?, 'node_death_replica_failed', ?, ?)`
|
||||||
|
msg := fmt.Sprintf("Replica on node %s marked failed (node confirmed dead), %d active replicas remaining", deadNodeID, activeCount)
|
||||||
|
cm.db.Exec(ctx, eventQuery, a.DeploymentID, msg, now)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|||||||
259
pkg/namespace/cluster_recovery_test.go
Normal file
259
pkg/namespace/cluster_recovery_test.go
Normal file
@ -0,0 +1,259 @@
|
|||||||
|
package namespace
|
||||||
|
|
||||||
|
import (
|
||||||
|
"context"
|
||||||
|
"database/sql"
|
||||||
|
"reflect"
|
||||||
|
"strings"
|
||||||
|
"sync"
|
||||||
|
"testing"
|
||||||
|
|
||||||
|
"github.com/DeBrosOfficial/network/pkg/rqlite"
|
||||||
|
"go.uber.org/zap"
|
||||||
|
)
|
||||||
|
|
||||||
|
// ---------------------------------------------------------------------------
|
||||||
|
// Mock DB with callback support for cluster recovery tests
|
||||||
|
// ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
// recoveryMockDB implements rqlite.Client with configurable query/exec callbacks.
|
||||||
|
type recoveryMockDB struct {
|
||||||
|
mu sync.Mutex
|
||||||
|
queryFunc func(dest any, query string, args ...any) error
|
||||||
|
execFunc func(query string, args ...any) error
|
||||||
|
queryCalls []mockQueryCall
|
||||||
|
execCalls []mockExecCall
|
||||||
|
}
|
||||||
|
|
||||||
|
func (m *recoveryMockDB) Query(_ context.Context, dest any, query string, args ...any) error {
|
||||||
|
m.mu.Lock()
|
||||||
|
ifaceArgs := make([]interface{}, len(args))
|
||||||
|
for i, a := range args {
|
||||||
|
ifaceArgs[i] = a
|
||||||
|
}
|
||||||
|
m.queryCalls = append(m.queryCalls, mockQueryCall{Query: query, Args: ifaceArgs})
|
||||||
|
fn := m.queryFunc
|
||||||
|
m.mu.Unlock()
|
||||||
|
|
||||||
|
if fn != nil {
|
||||||
|
return fn(dest, query, args...)
|
||||||
|
}
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
|
||||||
|
func (m *recoveryMockDB) Exec(_ context.Context, query string, args ...any) (sql.Result, error) {
|
||||||
|
m.mu.Lock()
|
||||||
|
ifaceArgs := make([]interface{}, len(args))
|
||||||
|
for i, a := range args {
|
||||||
|
ifaceArgs[i] = a
|
||||||
|
}
|
||||||
|
m.execCalls = append(m.execCalls, mockExecCall{Query: query, Args: ifaceArgs})
|
||||||
|
fn := m.execFunc
|
||||||
|
m.mu.Unlock()
|
||||||
|
|
||||||
|
if fn != nil {
|
||||||
|
if err := fn(query, args...); err != nil {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return mockResult{rowsAffected: 1}, nil
|
||||||
|
}
|
||||||
|
|
||||||
|
func (m *recoveryMockDB) FindBy(_ context.Context, _ any, _ string, _ map[string]any, _ ...rqlite.FindOption) error {
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
func (m *recoveryMockDB) FindOneBy(_ context.Context, _ any, _ string, _ map[string]any, _ ...rqlite.FindOption) error {
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
func (m *recoveryMockDB) Save(_ context.Context, _ any) error { return nil }
|
||||||
|
func (m *recoveryMockDB) Remove(_ context.Context, _ any) error { return nil }
|
||||||
|
func (m *recoveryMockDB) Repository(_ string) any { return nil }
|
||||||
|
func (m *recoveryMockDB) CreateQueryBuilder(_ string) *rqlite.QueryBuilder {
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
func (m *recoveryMockDB) Tx(_ context.Context, fn func(tx rqlite.Tx) error) error { return nil }
|
||||||
|
|
||||||
|
var _ rqlite.Client = (*recoveryMockDB)(nil)
|
||||||
|
|
||||||
|
func (m *recoveryMockDB) getExecCalls() []mockExecCall {
|
||||||
|
m.mu.Lock()
|
||||||
|
defer m.mu.Unlock()
|
||||||
|
cp := make([]mockExecCall, len(m.execCalls))
|
||||||
|
copy(cp, m.execCalls)
|
||||||
|
return cp
|
||||||
|
}
|
||||||
|
|
||||||
|
func (m *recoveryMockDB) getQueryCalls() []mockQueryCall {
|
||||||
|
m.mu.Lock()
|
||||||
|
defer m.mu.Unlock()
|
||||||
|
cp := make([]mockQueryCall, len(m.queryCalls))
|
||||||
|
copy(cp, m.queryCalls)
|
||||||
|
return cp
|
||||||
|
}
|
||||||
|
|
||||||
|
// appendToSlice creates a new element of the slice's element type, sets named
|
||||||
|
// fields using the provided map (keyed by struct field name), and appends it.
|
||||||
|
// This works with locally-defined types whose names are not accessible at compile time.
|
||||||
|
func appendToSlice(dest any, fields map[string]any) {
|
||||||
|
sliceVal := reflect.ValueOf(dest).Elem()
|
||||||
|
elemType := sliceVal.Type().Elem()
|
||||||
|
newElem := reflect.New(elemType).Elem()
|
||||||
|
for name, val := range fields {
|
||||||
|
f := newElem.FieldByName(name)
|
||||||
|
if f.IsValid() && f.CanSet() {
|
||||||
|
f.Set(reflect.ValueOf(val))
|
||||||
|
}
|
||||||
|
}
|
||||||
|
sliceVal.Set(reflect.Append(sliceVal, newElem))
|
||||||
|
}
|
||||||
|
|
||||||
|
// ---------------------------------------------------------------------------
|
||||||
|
// Tests
|
||||||
|
// ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
func TestMarkDeadNodeReplicasFailed_MarksReplicasAndDegradesDeploy(t *testing.T) {
|
||||||
|
// Scenario: node "dead-node" has 1 active replica for deployment "dep-1".
|
||||||
|
// Another replica on a healthy node remains active.
|
||||||
|
// Expected: replica marked failed, deployment set to 'degraded'.
|
||||||
|
db := &recoveryMockDB{}
|
||||||
|
|
||||||
|
db.queryFunc = func(dest any, query string, args ...any) error {
|
||||||
|
if strings.Contains(query, "DISTINCT deployment_id") {
|
||||||
|
appendToSlice(dest, map[string]any{"DeploymentID": "dep-1"})
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
if strings.Contains(query, "COUNT(*)") {
|
||||||
|
// One active replica remaining on a healthy node.
|
||||||
|
appendToSlice(dest, map[string]any{"Count": 1})
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
|
||||||
|
cm := &ClusterManager{db: db, logger: zap.NewNop()}
|
||||||
|
cm.markDeadNodeReplicasFailed(context.Background(), "dead-node")
|
||||||
|
|
||||||
|
execCalls := db.getExecCalls()
|
||||||
|
|
||||||
|
// Should have: 1 UPDATE replicas + 1 UPDATE deployment status + 1 INSERT event = 3
|
||||||
|
if len(execCalls) != 3 {
|
||||||
|
t.Fatalf("expected 3 exec calls, got %d: %+v", len(execCalls), execCalls)
|
||||||
|
}
|
||||||
|
|
||||||
|
// First exec: mark replicas failed.
|
||||||
|
if !strings.Contains(execCalls[0].Query, "UPDATE deployment_replicas") {
|
||||||
|
t.Errorf("first exec should update replicas, got: %s", execCalls[0].Query)
|
||||||
|
}
|
||||||
|
if execCalls[0].Args[0] != "dead-node" {
|
||||||
|
t.Errorf("expected dead-node arg, got: %v", execCalls[0].Args[0])
|
||||||
|
}
|
||||||
|
|
||||||
|
// Second exec: set deployment to degraded (not failed, since 1 replica remains).
|
||||||
|
if !strings.Contains(execCalls[1].Query, "status = 'degraded'") {
|
||||||
|
t.Errorf("expected degraded status update, got: %s", execCalls[1].Query)
|
||||||
|
}
|
||||||
|
|
||||||
|
// Third exec: deployment event log.
|
||||||
|
if !strings.Contains(execCalls[2].Query, "deployment_events") {
|
||||||
|
t.Errorf("expected event INSERT, got: %s", execCalls[2].Query)
|
||||||
|
}
|
||||||
|
if !strings.Contains(execCalls[2].Args[1].(string), "1 active replicas remaining") {
|
||||||
|
t.Errorf("event message should mention remaining replicas, got: %s", execCalls[2].Args[1])
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestMarkDeadNodeReplicasFailed_AllReplicasDead_SetsFailed(t *testing.T) {
|
||||||
|
// Scenario: node "dead-node" has the only replica for "dep-2".
|
||||||
|
// Expected: replica marked failed, deployment set to 'failed'.
|
||||||
|
db := &recoveryMockDB{}
|
||||||
|
|
||||||
|
db.queryFunc = func(dest any, query string, args ...any) error {
|
||||||
|
if strings.Contains(query, "DISTINCT deployment_id") {
|
||||||
|
appendToSlice(dest, map[string]any{"DeploymentID": "dep-2"})
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
if strings.Contains(query, "COUNT(*)") {
|
||||||
|
// Zero active replicas remaining.
|
||||||
|
appendToSlice(dest, map[string]any{"Count": 0})
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
|
||||||
|
cm := &ClusterManager{db: db, logger: zap.NewNop()}
|
||||||
|
cm.markDeadNodeReplicasFailed(context.Background(), "dead-node")
|
||||||
|
|
||||||
|
execCalls := db.getExecCalls()
|
||||||
|
|
||||||
|
if len(execCalls) != 3 {
|
||||||
|
t.Fatalf("expected 3 exec calls, got %d: %+v", len(execCalls), execCalls)
|
||||||
|
}
|
||||||
|
|
||||||
|
// Second exec: set deployment to failed (not degraded).
|
||||||
|
if !strings.Contains(execCalls[1].Query, "status = 'failed'") {
|
||||||
|
t.Errorf("expected failed status update, got: %s", execCalls[1].Query)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestMarkDeadNodeReplicasFailed_NoReplicas_ReturnsEarly(t *testing.T) {
|
||||||
|
// Scenario: dead node has no deployment replicas.
|
||||||
|
// Expected: no exec calls at all.
|
||||||
|
db := &recoveryMockDB{}
|
||||||
|
|
||||||
|
db.queryFunc = func(dest any, query string, args ...any) error {
|
||||||
|
// Return empty slice for all queries.
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
|
||||||
|
cm := &ClusterManager{db: db, logger: zap.NewNop()}
|
||||||
|
cm.markDeadNodeReplicasFailed(context.Background(), "dead-node")
|
||||||
|
|
||||||
|
execCalls := db.getExecCalls()
|
||||||
|
if len(execCalls) != 0 {
|
||||||
|
t.Errorf("expected 0 exec calls when no replicas, got %d", len(execCalls))
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestMarkDeadNodeReplicasFailed_MultipleDeployments(t *testing.T) {
|
||||||
|
// Scenario: dead node has replicas for 2 deployments.
|
||||||
|
// dep-1: has another healthy replica (degraded).
|
||||||
|
// dep-2: only replica was on dead node (failed).
|
||||||
|
db := &recoveryMockDB{}
|
||||||
|
countCallIdx := 0
|
||||||
|
|
||||||
|
db.queryFunc = func(dest any, query string, args ...any) error {
|
||||||
|
if strings.Contains(query, "DISTINCT deployment_id") {
|
||||||
|
appendToSlice(dest, map[string]any{"DeploymentID": "dep-1"})
|
||||||
|
appendToSlice(dest, map[string]any{"DeploymentID": "dep-2"})
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
if strings.Contains(query, "COUNT(*)") {
|
||||||
|
// First deployment has 1 remaining, second has 0.
|
||||||
|
counts := []int{1, 0}
|
||||||
|
appendToSlice(dest, map[string]any{"Count": counts[countCallIdx]})
|
||||||
|
countCallIdx++
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
|
||||||
|
cm := &ClusterManager{db: db, logger: zap.NewNop()}
|
||||||
|
cm.markDeadNodeReplicasFailed(context.Background(), "dead-node")
|
||||||
|
|
||||||
|
execCalls := db.getExecCalls()
|
||||||
|
|
||||||
|
// 1 mark-all-failed + 2*(status update + event) = 5
|
||||||
|
if len(execCalls) != 5 {
|
||||||
|
t.Fatalf("expected 5 exec calls, got %d: %+v", len(execCalls), execCalls)
|
||||||
|
}
|
||||||
|
|
||||||
|
// dep-1: degraded
|
||||||
|
if !strings.Contains(execCalls[1].Query, "status = 'degraded'") {
|
||||||
|
t.Errorf("dep-1 should be degraded, got: %s", execCalls[1].Query)
|
||||||
|
}
|
||||||
|
|
||||||
|
// dep-2: failed
|
||||||
|
if !strings.Contains(execCalls[3].Query, "status = 'failed'") {
|
||||||
|
t.Errorf("dep-2 should be failed, got: %s", execCalls[3].Query)
|
||||||
|
}
|
||||||
|
}
|
||||||
Loading…
x
Reference in New Issue
Block a user