feat: enhance service shutdown and logging in development environment

- Improved the `stop` target in the Makefile to ensure graceful shutdown of development services, making process termination more reliable.
- Updated the `StopAll` method in the ProcessManager to provide clearer logging during service shutdown, including progress updates and error handling.
- Added a new `PushNotificationService` to handle sending push notifications via Expo, including bulk notification capabilities and improved error handling.
- Refactored RQLite management to streamline node identification and logging, ensuring consistent behavior across node types during startup and recovery.
Author: anonpenguin23
Date: 2025-11-21 13:52:55 +02:00
Parent: 33ebf222ff
Commit: 4c1f842939
10 changed files with 703 additions and 384 deletions


@ -13,6 +13,42 @@ The format is based on [Keep a Changelog][keepachangelog] and adheres to [Semant
### Deprecated
### Fixed
## [0.69.17] - 2025-11-21
### Added
- Initial implementation of a Push Notification Service for the Gateway, utilizing the Expo API.
- Detailed documentation for RQLite operations, monitoring, and troubleshooting in the README.
### Changed
- Improved `make stop` and `dbn dev down` commands to ensure all development services are forcefully killed after a graceful shutdown attempt.
- Refactored RQLite startup logic to simplify cluster establishment and remove complex, error-prone leadership/recovery checks, relying on RQLite's built-in join mechanism.
- RQLite logs are now written to individual log files (e.g., `~/.debros/logs/rqlite-bootstrap.log`) instead of stdout/stderr, improving development environment clarity.
- Improved peer exchange discovery logging to suppress expected 'protocols not supported' warnings from lightweight clients like the Gateway.
### Deprecated
### Removed
### Fixed
## [0.69.16] - 2025-11-16
### Added


@ -19,7 +19,7 @@ test-e2e:
.PHONY: build clean test run-node run-node2 run-node3 run-example deps tidy fmt vet lint clear-ports install-hooks kill
VERSION := 0.69.16
VERSION := 0.69.17
COMMIT ?= $(shell git rev-parse --short HEAD 2>/dev/null || echo unknown)
DATE ?= $(shell date -u +%Y-%m-%dT%H:%M:%SZ)
LDFLAGS := -X 'main.version=$(VERSION)' -X 'main.commit=$(COMMIT)' -X 'main.date=$(DATE)'
@ -88,11 +88,9 @@ dev: build
# Graceful shutdown of all dev services
stop:
@if [ -f ./bin/dbn ]; then \
./bin/dbn dev down; \
else \
echo "⚠️ dbn binary not found, using force kill instead..."; \
bash scripts/dev-kill-all.sh; \
./bin/dbn dev down || true; \
fi
@bash scripts/dev-kill-all.sh
# Force kill all processes (immediate termination)
kill:

README.md

@ -590,15 +590,350 @@ Common endpoints (see `openapi/gateway.yaml` for the full spec):
- `POST /v1/pubsub/publish`, `GET /v1/pubsub/topics`, `GET /v1/pubsub/ws?topic=<topic>`
- `POST /v1/storage/upload`, `POST /v1/storage/pin`, `GET /v1/storage/status/:cid`, `GET /v1/storage/get/:cid`, `DELETE /v1/storage/unpin/:cid`
## RQLite Operations & Monitoring
RQLite is the distributed SQL database backing DeBros Network. Proper monitoring and maintenance are critical for cluster health.
### Connecting to RQLite
```bash
# Local development (bootstrap) - port 5001
rqlite -H localhost -p 5001
# Local development (bootstrap2) - port 5011
rqlite -H localhost -p 5011
# Production nodes
rqlite -H 192.168.1.151 -p 5001
```
### Health Checks (CRITICAL for Cluster Health)
```bash
# Check node status and diagnostics
rqlite -H localhost -p 5001 ".status"
# List all nodes in cluster (verify all nodes connected)
rqlite -H localhost -p 5001 ".nodes"
# Check if node is ready for operations
rqlite -H localhost -p 5001 ".ready"
# Get Go runtime info (goroutines, memory, performance)
rqlite -H localhost -p 5001 ".expvar"
# Show all tables
rqlite -H localhost -p 5001 ".tables"
# Show schema (CREATE statements)
rqlite -H localhost -p 5001 ".schema"
# Show all indexes
rqlite -H localhost -p 5001 ".indexes"
```
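For scripted monitoring, the same information is available over RQLite's HTTP API. A minimal Go probe against the `/status` endpoint (the same endpoint the node code polls during startup) could look like the sketch below; the host and port are assumptions matching the local bootstrap node, and the status document is decoded generically rather than assuming a fixed schema:
```go
package main

import (
	"encoding/json"
	"fmt"
	"net/http"
	"os"
	"time"
)

func main() {
	// Assumes the local bootstrap RQLite node on localhost:5001.
	client := &http.Client{Timeout: 3 * time.Second}
	resp, err := client.Get("http://localhost:5001/status")
	if err != nil {
		fmt.Fprintln(os.Stderr, "rqlite unreachable:", err)
		os.Exit(1)
	}
	defer resp.Body.Close()
	if resp.StatusCode != http.StatusOK {
		fmt.Fprintln(os.Stderr, "unexpected status code:", resp.StatusCode)
		os.Exit(1)
	}

	// The /status document varies between rqlite versions, so decode it
	// generically and pull out the Raft state if present.
	var status map[string]interface{}
	if err := json.NewDecoder(resp.Body).Decode(&status); err != nil {
		fmt.Fprintln(os.Stderr, "cannot decode status:", err)
		os.Exit(1)
	}
	if store, ok := status["store"].(map[string]interface{}); ok {
		if raft, ok := store["raft"].(map[string]interface{}); ok {
			fmt.Println("raft state:", raft["state"])
		}
	}
	fmt.Println("node is responding")
}
```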
### Backup & Restore
```bash
# Backup database
rqlite -H localhost -p 5001 ".backup ~/rqlite-backup.db"
# Restore from backup
rqlite -H localhost -p 5001 ".restore ~/rqlite-backup.db"
# Dump database in SQL text format
rqlite -H localhost -p 5001 ".dump ~/rqlite-dump.sql"
```
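Backups can also be pulled programmatically. The sketch below streams a hot backup from rqlite's documented `/db/backup` HTTP endpoint; the host, port, and output filename are assumptions, so verify the endpoint against your rqlite version:
```go
package main

import (
	"fmt"
	"io"
	"net/http"
	"os"
	"time"
)

func main() {
	// Assumes the local bootstrap RQLite node on localhost:5001.
	client := &http.Client{Timeout: 60 * time.Second}
	resp, err := client.Get("http://localhost:5001/db/backup")
	if err != nil {
		fmt.Fprintln(os.Stderr, "backup request failed:", err)
		os.Exit(1)
	}
	defer resp.Body.Close()

	// Write the backup stream to a local SQLite file.
	out, err := os.Create("rqlite-backup.db")
	if err != nil {
		fmt.Fprintln(os.Stderr, "cannot create backup file:", err)
		os.Exit(1)
	}
	defer out.Close()

	n, err := io.Copy(out, resp.Body)
	if err != nil {
		fmt.Fprintln(os.Stderr, "backup download failed:", err)
		os.Exit(1)
	}
	fmt.Printf("wrote %d bytes to rqlite-backup.db\n", n)
}
```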
### Consistency Levels (Important for Data Integrity)
RQLite supports three consistency levels for read operations:
```bash
# View current consistency level
rqlite -H localhost -p 5001 ".consistency"
# Set to weak (default, good balance for most applications)
rqlite -H localhost -p 5001 ".consistency weak"
# Set to strong (guaranteed consistency across entire cluster)
rqlite -H localhost -p 5001 ".consistency strong"
# Set to none (fastest reads, no consistency guarantees)
rqlite -H localhost -p 5001 ".consistency none"
```
**Recommendation**: Use `weak` for general operations, `strong` when data integrity is critical, and `none` only for cache-like data.
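Consistency can also be chosen per request when querying RQLite's HTTP API directly, via the `level` query parameter. The following Go sketch runs the same read at two levels; the host, port, and SQL statement are illustrative:
```go
package main

import (
	"fmt"
	"io"
	"net/http"
	"net/url"
	"os"
	"time"
)

func main() {
	// Assumes the local bootstrap RQLite node on localhost:5001.
	client := &http.Client{Timeout: 5 * time.Second}
	query := url.QueryEscape("SELECT 1")

	// Issue the same read at weak and strong consistency for comparison.
	for _, level := range []string{"weak", "strong"} {
		u := fmt.Sprintf("http://localhost:5001/db/query?level=%s&q=%s", level, query)
		resp, err := client.Get(u)
		if err != nil {
			fmt.Fprintf(os.Stderr, "%s read failed: %v\n", level, err)
			continue
		}
		body, _ := io.ReadAll(resp.Body)
		resp.Body.Close()
		fmt.Printf("level=%s -> %s\n", level, body)
	}
}
```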
### Cluster Management
```bash
# Show detailed cluster diagnostics
rqlite -H localhost -p 5001 ".sysdump /tmp/rqlite-diagnostic.txt"
# Remove a node from cluster (use raft ID from .nodes output)
rqlite -H localhost -p 5001 ".remove <raft_id>"
```
### RQLite Log Files (Development)
All RQLite logs are now written to individual files for easier debugging:
```
~/.debros/logs/rqlite-bootstrap.log
~/.debros/logs/rqlite-bootstrap2.log
~/.debros/logs/rqlite-node2.log
~/.debros/logs/rqlite-node3.log
~/.debros/logs/rqlite-node4.log
```
View logs:
```bash
tail -f ~/.debros/logs/rqlite-bootstrap.log
tail -f ~/.debros/logs/rqlite-node2.log
dbn dev logs rqlite-bootstrap --follow
```
## Development Environment Operations
### Starting & Managing Development Environment
```bash
# Start the complete development stack (2 bootstraps + 3 nodes + gateway)
make dev
# Check status of running services
dbn dev status
# Stop all services
dbn dev down
```
### Development Logs
```bash
# View logs for specific component
dbn dev logs bootstrap
dbn dev logs bootstrap2
dbn dev logs node2
dbn dev logs node3
dbn dev logs node4
dbn dev logs gateway
dbn dev logs olric
dbn dev logs anon
# Follow logs in real-time (like tail -f)
dbn dev logs bootstrap --follow
dbn dev logs rqlite-bootstrap --follow
```
### Key Development Endpoints
```
Gateway: http://localhost:6001
Bootstrap IPFS: http://localhost:4501
Bootstrap2 IPFS: http://localhost:4511
Node2 IPFS: http://localhost:4502
Node3 IPFS: http://localhost:4503
Node4 IPFS: http://localhost:4504
Anon SOCKS: 127.0.0.1:9050
Olric Cache: http://localhost:3320
RQLite Bootstrap: http://localhost:5001
RQLite Bootstrap2: http://localhost:5011
RQLite Node2: http://localhost:5002
RQLite Node3: http://localhost:5003
RQLite Node4: http://localhost:5004
```
## IPFS Configuration
### Ensure Consistent Cluster Setup
All nodes in a cluster must have identical `cluster.secret` and `swarm.key`:
```bash
# Copy swarm key to each host (adjust path for bootstrap vs node):
# Bootstrap node
sudo cp /home/debros/.debros/secrets/swarm.key /home/debros/.debros/data/bootstrap/ipfs/repo/swarm.key
# Regular nodes
sudo cp /home/debros/.debros/secrets/swarm.key /home/debros/.debros/data/node/ipfs/repo/swarm.key
# Fix permissions
sudo chown debros:debros /home/debros/.debros/data/*/ipfs/repo/swarm.key
sudo chmod 600 /home/debros/.debros/data/*/ipfs/repo/swarm.key
```
### Important IPFS Configuration Notes
- **Production**: Update the Olric config, changing `0.0.0.0` to the node's actual IP address in both entries
- **All Nodes**: Must have identical `cluster.secret` and `swarm.key` for cluster to form
## Troubleshooting
### General Issues
- **Config directory errors**: Ensure `~/.debros/` exists, is writable, and has free disk space (`touch ~/.debros/test && rm ~/.debros/test`).
- **Port conflicts**: Inspect with `lsof -i :4001` (or other ports) and stop conflicting processes or regenerate configs with new ports.
- **Missing configs**: Run `./bin/dbn config init` before starting nodes.
- **Cluster join issues**: Confirm the bootstrap node is running, `peer.info` multiaddr matches `bootstrap_peers`, and firewall rules allow the P2P ports.
### RQLite Troubleshooting
#### Cluster Not Forming
```bash
# Verify all nodes see each other
rqlite -H localhost -p 5001 ".nodes"
# Check node readiness
rqlite -H localhost -p 5001 ".ready"
# Check status and Raft logs
rqlite -H localhost -p 5001 ".status"
```
#### Broken RQLite Raft (Production)
```bash
# Fix RQLite Raft consensus
sudo env HOME=/home/debros network-cli rqlite fix
```
#### Reset RQLite State (DESTRUCTIVE - Last Resort Only)
```bash
# ⚠️ WARNING: This destroys all RQLite data!
rm -f ~/.debros/data/rqlite/raft.db
rm -f ~/.debros/data/rqlite/raft/peers.json
```
#### Kill IPFS Cluster Service
```bash
pkill -f ipfs-cluster-service
```
### Services Not Starting
```bash
# Check service status
systemctl status debros-node-bootstrap
# View detailed logs
journalctl -u debros-node-bootstrap -n 100
# Check log files
tail -f /home/debros/.debros/logs/node-bootstrap.log
```
### Port Conflicts
```bash
# Check what's using specific ports
sudo lsof -i :4001 # P2P port
sudo lsof -i :5001 # RQLite HTTP
sudo lsof -i :6001 # Gateway
sudo lsof -i :9094 # IPFS Cluster API
# Kill all DeBros-related processes (except Anyone on 9050)
lsof -ti:7001,7002,7003,5001,5002,5003,6001,4001,3320,3322,9094 | xargs kill -9 2>/dev/null && echo "Killed processes" || echo "No processes found"
```
### Systemd Service Management
```bash
# Stop all services (keeps Anyone proxy running on 9050)
sudo systemctl stop debros-*
# Disable services from auto-start
sudo systemctl disable debros-*
# Restart all services
sudo systemctl restart debros-*
# Enable services for auto-start on boot
sudo systemctl enable debros-*
# View all DeBros services
systemctl list-units 'debros-*'
# Clean up failed services
sudo systemctl reset-failed
```
### Reset Installation (⚠️ Destroys All Data)
```bash
# Start fresh (production)
sudo dbn prod uninstall
sudo rm -rf /home/debros/.debros
sudo dbn prod install --bootstrap --branch nightly
```
## Operations Cheat Sheet
### User Management (Linux)
```bash
# Switch to DeBros user
sudo -u debros bash
# Kill all DeBros user processes
sudo killall -9 -u debros
# Remove DeBros user completely
sudo userdel -r -f debros
```
### Installation & Deployment
```bash
# Local development
make dev
# Install nightly branch
wget https://raw.githubusercontent.com/DeBrosOfficial/network/refs/heads/nightly/scripts/install-debros-network.sh
chmod +x ./install-debros-network.sh
./install-debros-network.sh --prerelease --nightly
# Production bootstrap node
sudo dbn prod install --bootstrap --branch nightly
# Production secondary node
sudo dbn prod install \
--vps-ip <your_ip> \
--peers /ip4/<bootstrap_ip>/tcp/4001/p2p/<peer_id> \
--branch nightly
```
### Configuration & Sudoers (Deploy User)
```bash
# Add these lines to sudoers (e.g., via visudo) for deploy automation
ubuntu ALL=(ALL) NOPASSWD: /bin/bash
ubuntu ALL=(ALL) NOPASSWD: /usr/bin/make
# Git configuration
git config --global --add safe.directory /home/debros/src
```
### Authentication
```bash
# Login to gateway
env DEBROS_GATEWAY_URL=https://node-kv4la8.debros.network dbn auth login
```
## Resources
- [RQLite CLI Documentation](https://rqlite.io/docs/cli/)
- [RQLite Features](https://rqlite.io/docs/features/)
- [RQLite Clustering Guide](https://rqlite.io/docs/clustering/)
- [RQLite Security](https://rqlite.io/docs/security/)
- [RQLite Backup & Restore](https://rqlite.io/docs/backup-and-restore/)
- Go modules: `go mod tidy`, `go test ./...`
- Automation: `make build`, `make dev`, `make run-gateway`, `make lint`
- API reference: `openapi/gateway.yaml`


@ -136,7 +136,10 @@ func handleDevDown(args []string) {
if err := pm.StopAll(ctx); err != nil {
fmt.Fprintf(os.Stderr, "⚠️ Error stopping services: %v\n", err)
os.Exit(1)
}
fmt.Printf("✅ All services have been stopped\n\n")
}
func handleDevStatus(args []string) {


@ -6,6 +6,7 @@ import (
"errors"
"io"
"strconv"
"strings"
"time"
"github.com/libp2p/go-libp2p/core/host"
@ -420,11 +421,20 @@ func (d *Manager) requestPeersFromPeer(ctx context.Context, peerID peer.ID, limi
// Open a stream to the peer
stream, err := d.host.NewStream(ctx, peerID, PeerExchangeProtocol)
if err != nil {
// Suppress repeated warnings for the same peer (log once per minute max)
// Check if this is a "protocols not supported" error (expected for lightweight clients like gateway)
if strings.Contains(err.Error(), "protocols not supported") {
// This is a lightweight client (gateway, etc.) that doesn't support peer exchange - expected behavior
// Track it to avoid repeated attempts, but don't log as it's not an error
d.failedPeerExchanges[peerID] = time.Now()
return nil
}
// For actual connection errors, log but suppress repeated warnings for the same peer
lastFailure, seen := d.failedPeerExchanges[peerID]
if !seen || time.Since(lastFailure) > time.Minute {
d.logger.Debug("Failed to open peer exchange stream",
d.logger.Debug("Failed to open peer exchange stream with node",
zap.String("peer_id", peerID.String()[:8]+"..."),
zap.String("reason", "peer does not support peer exchange protocol or connection failed"),
zap.Error(err))
d.failedPeerExchanges[peerID] = time.Now()
}


@ -95,7 +95,7 @@ func (pm *ProcessManager) StartAll(ctx context.Context) error {
// StopAll stops all running processes
func (pm *ProcessManager) StopAll(ctx context.Context) error {
fmt.Fprintf(pm.logWriter, "\n🛑 Stopping development environment...\n")
fmt.Fprintf(pm.logWriter, "\n🛑 Stopping development environment...\n\n")
topology := DefaultTopology()
var services []string
@ -116,11 +116,22 @@ func (pm *ProcessManager) StopAll(ctx context.Context) error {
}
services = append(services, "olric", "anon")
fmt.Fprintf(pm.logWriter, "Stopping %d services...\n\n", len(services))
// Stop all processes sequentially (in dependency order) and wait for each
stoppedCount := 0
for _, svc := range services {
pm.stopProcess(svc)
if err := pm.stopProcess(svc); err != nil {
fmt.Fprintf(pm.logWriter, "⚠️ Error stopping %s: %v\n", svc, err)
} else {
stoppedCount++
}
// Show progress
fmt.Fprintf(pm.logWriter, " [%d/%d] stopped\n", stoppedCount, len(services))
}
fmt.Fprintf(pm.logWriter, "✓ All services stopped\n\n")
fmt.Fprintf(pm.logWriter, "\n✅ All %d services have been stopped\n\n", stoppedCount)
return nil
}


@ -0,0 +1,184 @@
package gateway
import (
"bytes"
"context"
"encoding/json"
"fmt"
"io"
"net/http"
"time"
"go.uber.org/zap"
)
// PushNotificationService handles sending push notifications via Expo
type PushNotificationService struct {
logger *zap.Logger
client *http.Client
}
// ExpoTicket represents the response from Expo API
type ExpoTicket struct {
ID string `json:"id"`
Error string `json:"error,omitempty"`
}
// ExpoPushMessage represents a message to send via Expo
type ExpoPushMessage struct {
To string `json:"to"`
Title string `json:"title"`
Body string `json:"body"`
Data map[string]interface{} `json:"data,omitempty"`
Sound string `json:"sound,omitempty"`
Badge int `json:"badge,omitempty"`
Priority string `json:"priority,omitempty"`
// iOS specific
MutableContent bool `json:"mutableContent,omitempty"`
IosIcon string `json:"iosIcon,omitempty"`
// Android specific
AndroidBigLargeIcon string `json:"androidBigLargeIcon,omitempty"`
ChannelID string `json:"channelId,omitempty"`
}
// NewPushNotificationService creates a new push notification service
func NewPushNotificationService(logger *zap.Logger) *PushNotificationService {
return &PushNotificationService{
logger: logger,
client: &http.Client{
Timeout: 10 * time.Second,
},
}
}
// SendNotification sends a push notification via Expo
func (pns *PushNotificationService) SendNotification(
ctx context.Context,
expoPushToken string,
title string,
body string,
data map[string]interface{},
avatarURL string,
) error {
if expoPushToken == "" {
return fmt.Errorf("empty expo push token")
}
message := ExpoPushMessage{
To: expoPushToken,
Title: title,
Body: body,
Data: data,
Sound: "default",
Priority: "high",
// Enable mutable content for iOS to allow Notification Service Extension
MutableContent: true,
ChannelID: "messages",
AndroidBigLargeIcon: avatarURL,
}
// For iOS, include avatar in data so Notification Service Extension can fetch it
if avatarURL != "" {
if message.Data == nil {
message.Data = make(map[string]interface{})
}
message.Data["avatar_url"] = avatarURL
}
return pns.sendExpoRequest(ctx, message)
}
// SendBulkNotifications sends notifications to multiple users
func (pns *PushNotificationService) SendBulkNotifications(
ctx context.Context,
expoPushTokens []string,
title string,
body string,
data map[string]interface{},
avatarURL string,
) []error {
errors := make([]error, 0)
for _, token := range expoPushTokens {
if err := pns.SendNotification(ctx, token, title, body, data, avatarURL); err != nil {
errors = append(errors, fmt.Errorf("failed to send to token %s: %w", token, err))
}
}
return errors
}
// sendExpoRequest sends a request to the Expo push notification API
func (pns *PushNotificationService) sendExpoRequest(ctx context.Context, message ExpoPushMessage) error {
const expoAPIURL = "https://exp.host/--/api/v2/push/send"
body, err := json.Marshal(message)
if err != nil {
pns.logger.Error("failed to marshal push notification",
zap.Error(err),
zap.String("to", message.To))
return fmt.Errorf("marshal error: %w", err)
}
req, err := http.NewRequestWithContext(ctx, http.MethodPost, expoAPIURL, bytes.NewBuffer(body))
if err != nil {
pns.logger.Error("failed to create push notification request",
zap.Error(err),
zap.String("to", message.To))
return fmt.Errorf("request creation error: %w", err)
}
req.Header.Set("Content-Type", "application/json")
resp, err := pns.client.Do(req)
if err != nil {
pns.logger.Error("failed to send push notification",
zap.Error(err),
zap.String("to", message.To))
return fmt.Errorf("send error: %w", err)
}
defer resp.Body.Close()
respBody, err := io.ReadAll(resp.Body)
if err != nil {
pns.logger.Error("failed to read push notification response",
zap.Error(err),
zap.String("to", message.To))
return fmt.Errorf("response read error: %w", err)
}
// Check for API errors
if resp.StatusCode != http.StatusOK {
pns.logger.Warn("push notification API error",
zap.Int("status_code", resp.StatusCode),
zap.String("response", string(respBody)),
zap.String("to", message.To))
return fmt.Errorf("API returned status %d: %s", resp.StatusCode, string(respBody))
}
// Parse response
var tickets []ExpoTicket
if err := json.Unmarshal(respBody, &tickets); err != nil {
pns.logger.Error("failed to parse push notification response",
zap.Error(err),
zap.String("response", string(respBody)))
return fmt.Errorf("parse error: %w", err)
}
// Check for errors in tickets
for _, ticket := range tickets {
if ticket.Error != "" {
pns.logger.Warn("push notification error in ticket",
zap.String("error", ticket.Error),
zap.String("to", message.To))
return fmt.Errorf("ticket error: %s", ticket.Error)
}
}
pns.logger.Info("push notification sent successfully",
zap.String("to", message.To),
zap.String("title", message.Title))
return nil
}
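A brief usage sketch for the service above, assumed to live in the same gateway package; the Expo push token is a placeholder supplied by the client app's registration flow, and the helper name is hypothetical:
```go
// notifyNewMessage is a hypothetical helper showing how the gateway might
// invoke PushNotificationService; it is not part of this diff.
func notifyNewMessage(ctx context.Context, logger *zap.Logger, token, preview string) error {
	pns := NewPushNotificationService(logger)

	// Illustrative payload consumed by the mobile app.
	data := map[string]interface{}{
		"type": "new_message",
	}

	// avatarURL is optional; pass "" when the sender has no avatar.
	return pns.SendNotification(ctx, token, "New message", preview, data, "")
}
```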


@ -70,15 +70,26 @@ func NewNode(cfg *config.Config) (*Node, error) {
func (n *Node) startRQLite(ctx context.Context) error {
n.logger.Info("Starting RQLite database")
// Determine node identifier for log filename - use node ID for unique filenames
nodeID := n.config.Node.ID
if nodeID == "" {
// Fallback to type if ID is not set
nodeID = n.config.Node.Type
if nodeID == "" {
nodeID = "node"
}
}
// Create RQLite manager
n.rqliteManager = database.NewRQLiteManager(&n.config.Database, &n.config.Discovery, n.config.Node.DataDir, n.logger.Logger)
n.rqliteManager.SetNodeType(nodeID)
// Initialize cluster discovery service if LibP2P host is available
if n.host != nil && n.discoveryManager != nil {
// Determine node type
nodeType := "node"
// Determine node type for cluster discovery (bootstrap or node)
discoveryNodeType := "node"
if n.config.Node.Type == "bootstrap" {
nodeType = "bootstrap"
discoveryNodeType = "bootstrap"
}
// Create cluster discovery service
@ -87,7 +98,7 @@ func (n *Node) startRQLite(ctx context.Context) error {
n.discoveryManager,
n.rqliteManager,
n.config.Node.ID,
nodeType,
discoveryNodeType,
n.config.Discovery.RaftAdvAddress,
n.config.Discovery.HttpAdvAddress,
n.config.Node.DataDir,


@ -633,11 +633,7 @@ func (c *ClusterDiscoveryService) WaitForDiscoverySettling(ctx context.Context)
// TriggerSync manually triggers a cluster membership sync
func (c *ClusterDiscoveryService) TriggerSync() {
// For bootstrap nodes, wait a bit for peer discovery to stabilize
if c.nodeType == "bootstrap" {
time.Sleep(5 * time.Second)
}
// All nodes use the same discovery timing for consistency
c.updateClusterMembership()
}


@ -10,7 +10,6 @@ import (
"os"
"os/exec"
"path/filepath"
"strconv"
"strings"
"syscall"
"time"
@ -26,6 +25,7 @@ type RQLiteManager struct {
config *config.DatabaseConfig
discoverConfig *config.DiscoveryConfig
dataDir string
nodeType string // "bootstrap" or "node"
logger *zap.Logger
cmd *exec.Cmd
connection *gorqlite.Connection
@ -81,6 +81,13 @@ func (r *RQLiteManager) SetDiscoveryService(service *ClusterDiscoveryService) {
r.discoveryService = service
}
// SetNodeType sets the node type for this RQLite manager ("bootstrap" or "node")
func (r *RQLiteManager) SetNodeType(nodeType string) {
if nodeType != "" {
r.nodeType = nodeType
}
}
// UpdateAdvertisedAddresses overrides the discovery advertised addresses when cluster discovery
// infers a better host than what was provided via configuration (e.g. replacing localhost).
func (r *RQLiteManager) UpdateAdvertisedAddresses(raftAddr, httpAddr string) {
@ -233,7 +240,7 @@ func (r *RQLiteManager) launchProcess(ctx context.Context, rqliteDataDir string)
"-raft-addr", fmt.Sprintf("0.0.0.0:%d", r.config.RQLiteRaftPort),
}
// Add join address if specified (for non-bootstrap or secondary bootstrap nodes)
// All nodes follow the same join logic - either join specified address or start as single-node cluster
if r.config.RQLiteJoinAddress != "" {
r.logger.Info("Joining RQLite cluster", zap.String("join_address", r.config.RQLiteJoinAddress))
@ -259,28 +266,9 @@ func (r *RQLiteManager) launchProcess(ctx context.Context, rqliteDataDir string)
// Add retry parameters to handle slow cluster startup (e.g., during recovery)
args = append(args, "-join", joinArg, "-join-attempts", "30", "-join-interval", "10s")
} else {
r.logger.Info("No join address specified - starting as new cluster")
// For bootstrap nodes, use bootstrap-expect if we know about other peers
if r.discoveryService != nil {
allPeers := r.discoveryService.GetAllPeers()
remotePeerCount := 0
for _, peer := range allPeers {
if peer.NodeID != r.discoverConfig.RaftAdvAddress {
remotePeerCount++
}
}
// Use bootstrap-expect if we have discovered enough peers
// This tells RQLite to wait for the expected number of nodes before forming cluster
if remotePeerCount >= (r.config.MinClusterSize - 1) {
expectedPeers := r.config.MinClusterSize
args = append(args, "-bootstrap-expect", strconv.Itoa(expectedPeers))
r.logger.Info("Using bootstrap-expect to wait for cluster formation",
zap.Int("expected_peers", expectedPeers),
zap.Int("remote_peers_discovered", remotePeerCount))
}
}
r.logger.Info("No join address specified - starting as single-node cluster")
// When no join address is provided, rqlited will start as a single-node cluster
// This is expected for the first node in a fresh cluster
}
// Add data directory as positional argument
@ -295,14 +283,41 @@ func (r *RQLiteManager) launchProcess(ctx context.Context, rqliteDataDir string)
// Start RQLite process (not bound to ctx for graceful Stop handling)
r.cmd = exec.Command("rqlited", args...)
// Enable debug logging of RQLite process to help diagnose issues
r.cmd.Stdout = os.Stdout
r.cmd.Stderr = os.Stderr
// Setup log file for RQLite output
// Determine node type for log filename
nodeType := r.nodeType
if nodeType == "" {
nodeType = "node"
}
// Create logs directory
logsDir := filepath.Join(filepath.Dir(r.dataDir), "logs")
if err := os.MkdirAll(logsDir, 0755); err != nil {
return fmt.Errorf("failed to create logs directory at %s: %w", logsDir, err)
}
// Open log file for RQLite output
logPath := filepath.Join(logsDir, fmt.Sprintf("rqlite-%s.log", nodeType))
logFile, err := os.OpenFile(logPath, os.O_CREATE|os.O_WRONLY|os.O_APPEND, 0644)
if err != nil {
return fmt.Errorf("failed to open RQLite log file at %s: %w", logPath, err)
}
r.logger.Info("RQLite logs will be written to file",
zap.String("path", logPath))
r.cmd.Stdout = logFile
r.cmd.Stderr = logFile
if err := r.cmd.Start(); err != nil {
logFile.Close()
return fmt.Errorf("failed to start RQLite: %w", err)
}
// Close the log file handle after process starts (the subprocess maintains its own reference)
// This allows the file to be rotated or inspected while the process is running
logFile.Close()
return nil
}
@ -337,19 +352,18 @@ func (r *RQLiteManager) waitForReadyAndConnect(ctx context.Context) error {
// Check if error is "store is not open" (recovery scenario)
if strings.Contains(err.Error(), "store is not open") {
if attempt < maxConnectAttempts-1 {
// Only retry for joining nodes; bootstrap nodes should fail fast
if r.config.RQLiteJoinAddress != "" {
if attempt%3 == 0 {
r.logger.Debug("RQLite store not yet accessible for connection, retrying...",
zap.Int("attempt", attempt+1), zap.Error(err))
}
time.Sleep(connectBackoff)
connectBackoff = time.Duration(float64(connectBackoff) * 1.5)
if connectBackoff > 5*time.Second {
connectBackoff = 5 * time.Second
}
continue
// Retry with exponential backoff for all nodes during recovery
// The store may not open immediately, especially during cluster recovery
if attempt%3 == 0 {
r.logger.Debug("RQLite store not yet accessible for connection, retrying...",
zap.Int("attempt", attempt+1), zap.Error(err))
}
time.Sleep(connectBackoff)
connectBackoff = time.Duration(float64(connectBackoff) * 1.5)
if connectBackoff > 5*time.Second {
connectBackoff = 5 * time.Second
}
continue
}
}
@ -376,130 +390,52 @@ func (r *RQLiteManager) waitForReadyAndConnect(ctx context.Context) error {
return nil
}
// establishLeadershipOrJoin establishes leadership (bootstrap) or waits for SQL availability (joining)
// establishLeadershipOrJoin handles post-startup cluster establishment
// All nodes follow the same pattern: wait for SQL availability
// For nodes without a join address, RQLite automatically forms a single-node cluster and becomes leader
func (r *RQLiteManager) establishLeadershipOrJoin(ctx context.Context, rqliteDataDir string) error {
if r.config.RQLiteJoinAddress == "" {
// Bootstrap node logic with data safety checks
r.logger.Info("Bootstrap node: checking if safe to lead")
// First node - no join address specified
// RQLite will automatically form a single-node cluster and become leader
r.logger.Info("Starting as first node in cluster")
// SAFETY: Check if we can safely become leader
canLead, err := r.canSafelyBecomeLeader()
if !canLead && err != nil {
r.logger.Warn("Not safe to become leader, attempting to join existing cluster",
zap.Error(err))
// Find node with highest log index and join it
if r.discoveryService != nil {
targetNode := r.discoveryService.GetNodeWithHighestLogIndex()
if targetNode != nil {
r.logger.Info("Joining node with higher data",
zap.String("target_node", targetNode.NodeID),
zap.String("raft_address", targetNode.RaftAddress),
zap.Uint64("their_index", targetNode.RaftLogIndex))
return r.joinExistingCluster(ctx, targetNode.RaftAddress)
}
}
}
// Safe to lead - attempt leadership
leadershipErr := r.waitForLeadership(ctx)
if leadershipErr == nil {
r.logger.Info("Bootstrap node successfully established leadership")
return nil
}
r.logger.Warn("Initial leadership attempt failed, may need cluster recovery",
zap.Error(leadershipErr))
// Try recovery if we have peers.json from discovery
if r.discoveryService != nil {
peersPath := filepath.Join(rqliteDataDir, "raft", "peers.json")
if _, err := os.Stat(peersPath); err == nil {
r.logger.Info("Attempting cluster recovery using peers.json",
zap.String("peers_file", peersPath))
if recoveryErr := r.recoverCluster(ctx, peersPath); recoveryErr == nil {
r.logger.Info("Cluster recovery successful, retrying leadership")
leadershipErr = r.waitForLeadership(ctx)
if leadershipErr == nil {
r.logger.Info("Bootstrap node established leadership after recovery")
return nil
}
} else {
r.logger.Warn("Cluster recovery failed", zap.Error(recoveryErr))
}
}
// Check if we're stuck in configuration mismatch after recovery failed
if leadershipErr != nil && r.isStuckInConfigurationMismatch() {
r.logger.Warn("Detected persistent configuration mismatch, attempting automatic recovery")
// Verify it's safe to clear state (peers have higher log indexes)
if r.isSafeToClearState(rqliteDataDir) {
r.logger.Info("Clearing stale Raft state to resolve configuration mismatch")
if err := r.clearRaftState(rqliteDataDir); err != nil {
r.logger.Error("Failed to clear Raft state", zap.Error(err))
} else {
// Force write peers.json after clearing state
if r.discoveryService != nil {
r.logger.Info("Force writing peers.json after clearing state for configuration mismatch recovery")
if err := r.discoveryService.ForceWritePeersJSON(); err != nil {
r.logger.Error("Failed to force write peers.json", zap.Error(err))
}
// Update peersPath after force write
peersPath = filepath.Join(rqliteDataDir, "raft", "peers.json")
}
// Restart RQLite with clean state
r.logger.Info("Raft state cleared, restarting RQLite for clean rejoin")
if recoveryErr := r.recoverCluster(ctx, peersPath); recoveryErr == nil {
// Retry leadership after state clear
leadershipErr = r.waitForLeadership(ctx)
if leadershipErr == nil {
r.logger.Info("Bootstrap node established leadership after state clear")
return nil
}
}
}
} else {
r.logger.Warn("Configuration mismatch detected but clearing state is unsafe",
zap.String("reason", "peers may not have more recent data"),
zap.String("action", "manual intervention may be required"))
}
}
}
// Final fallback: SQL availability
r.logger.Warn("Leadership failed, trying SQL availability")
// Wait for SQL to be available (indicates RQLite cluster is ready)
sqlCtx := ctx
if _, hasDeadline := ctx.Deadline(); !hasDeadline {
var cancel context.CancelFunc
sqlCtx, cancel = context.WithTimeout(context.Background(), 2*time.Minute)
defer cancel()
}
if err := r.waitForSQLAvailable(sqlCtx); err != nil {
if r.cmd != nil && r.cmd.Process != nil {
_ = r.cmd.Process.Kill()
}
return fmt.Errorf("RQLite SQL not available: %w", err)
}
return nil
} else {
// Joining node logic
r.logger.Info("Waiting for RQLite SQL availability (leader discovery)")
sqlCtx := ctx
if _, hasDeadline := ctx.Deadline(); !hasDeadline {
var cancel context.CancelFunc
sqlCtx, cancel = context.WithTimeout(context.Background(), 5*time.Minute)
defer cancel()
}
if err := r.waitForSQLAvailable(sqlCtx); err != nil {
if r.cmd != nil && r.cmd.Process != nil {
_ = r.cmd.Process.Kill()
}
return fmt.Errorf("RQLite SQL not available: %w", err)
return fmt.Errorf("SQL not available for first node: %w", err)
}
r.logger.Info("First node established successfully")
return nil
}
// Joining node - wait for SQL availability (indicates it joined the leader)
r.logger.Info("Waiting for RQLite SQL availability (joining cluster)")
sqlCtx := ctx
if _, hasDeadline := ctx.Deadline(); !hasDeadline {
var cancel context.CancelFunc
sqlCtx, cancel = context.WithTimeout(context.Background(), 5*time.Minute)
defer cancel()
}
if err := r.waitForSQLAvailable(sqlCtx); err != nil {
if r.cmd != nil && r.cmd.Process != nil {
_ = r.cmd.Process.Kill()
}
return fmt.Errorf("RQLite SQL not available: %w", err)
}
r.logger.Info("Node successfully joined cluster")
return nil
}
// hasExistingState returns true if the rqlite data directory already contains files or subdirectories.
@ -526,16 +462,9 @@ func (r *RQLiteManager) waitForReady(ctx context.Context) error {
url := fmt.Sprintf("http://localhost:%d/status", r.config.RQLitePort)
client := &http.Client{Timeout: 2 * time.Second}
// Determine timeout based on whether this is a joining node
// Joining nodes in recovery may take longer to open the store
var maxAttempts int
if r.config.RQLiteJoinAddress != "" {
// Joining node: allow up to 180 seconds (3 minutes) for recovery
maxAttempts = 180
} else {
// Bootstrap node: allow 30 seconds
maxAttempts = 30
}
// All nodes may need time to open the store during recovery
// Use consistent timeout for cluster consistency
maxAttempts := 180 // 180 seconds (3 minutes) for all nodes
for i := 0; i < maxAttempts; i++ {
select {
@ -589,46 +518,6 @@ func (r *RQLiteManager) waitForReady(ctx context.Context) error {
}
// waitForLeadership waits for RQLite to establish leadership (for bootstrap nodes)
func (r *RQLiteManager) waitForLeadership(ctx context.Context) error {
r.logger.Info("Waiting for RQLite to establish leadership...")
maxAttempts := 30
attempt := 0
backoffDelay := 500 * time.Millisecond
maxBackoff := 5 * time.Second
for attempt < maxAttempts {
select {
case <-ctx.Done():
return ctx.Err()
default:
}
// Try a simple query to check if leadership is established
if r.connection != nil {
_, err := r.connection.QueryOne("SELECT 1")
if err == nil {
r.logger.Info("RQLite leadership established")
return nil
}
// Log every 5th attempt or on first attempt to reduce noise
if attempt%5 == 0 || attempt == 0 {
r.logger.Debug("Waiting for leadership", zap.Int("attempt", attempt+1), zap.Error(err))
}
}
// Exponential backoff with jitter
time.Sleep(backoffDelay)
backoffDelay = time.Duration(float64(backoffDelay) * 1.5)
if backoffDelay > maxBackoff {
backoffDelay = maxBackoff
}
attempt++
}
return fmt.Errorf("RQLite failed to establish leadership within timeout")
}
// GetConnection returns the RQLite connection
func (r *RQLiteManager) GetConnection() *gorqlite.Connection {
return r.connection
@ -819,69 +708,6 @@ func (r *RQLiteManager) testJoinAddress(joinAddress string) error {
return nil
}
// canSafelyBecomeLeader checks if this node can safely become leader without causing data loss
func (r *RQLiteManager) canSafelyBecomeLeader() (bool, error) {
// Get our current Raft log index
ourLogIndex := r.getRaftLogIndex()
// If no discovery service, assume it's safe (backward compatibility)
if r.discoveryService == nil {
r.logger.Debug("No discovery service, assuming safe to lead")
return true, nil
}
// Query discovery service for other nodes
otherNodes := r.discoveryService.GetActivePeers()
if len(otherNodes) == 0 {
// No other nodes - safe to bootstrap
r.logger.Debug("No other nodes discovered, safe to lead",
zap.Uint64("our_log_index", ourLogIndex))
return true, nil
}
// Check if any other node has higher log index
for _, peer := range otherNodes {
if peer.RaftLogIndex > ourLogIndex {
// Other node has more data - we should join them
return false, fmt.Errorf(
"node %s has higher log index (%d > %d), should join as follower",
peer.NodeID, peer.RaftLogIndex, ourLogIndex)
}
}
// We have most recent data or equal - safe to lead
r.logger.Info("Safe to lead - we have most recent data",
zap.Uint64("our_log_index", ourLogIndex),
zap.Int("other_nodes_checked", len(otherNodes)))
return true, nil
}
// joinExistingCluster attempts to join an existing cluster as a follower
func (r *RQLiteManager) joinExistingCluster(ctx context.Context, raftAddress string) error {
r.logger.Info("Attempting to join existing cluster",
zap.String("target_raft_address", raftAddress))
// Wait for the target to be reachable
if err := r.waitForJoinTarget(ctx, raftAddress, 2*time.Minute); err != nil {
return fmt.Errorf("join target not reachable: %w", err)
}
// Wait for SQL availability (the target should have a leader)
sqlCtx := ctx
if _, hasDeadline := ctx.Deadline(); !hasDeadline {
var cancel context.CancelFunc
sqlCtx, cancel = context.WithTimeout(context.Background(), 5*time.Minute)
defer cancel()
}
if err := r.waitForSQLAvailable(sqlCtx); err != nil {
return fmt.Errorf("failed to join cluster - SQL not available: %w", err)
}
r.logger.Info("Successfully joined existing cluster")
return nil
}
// exponentialBackoff calculates exponential backoff duration with jitter
func (r *RQLiteManager) exponentialBackoff(attempt int, baseDelay time.Duration, maxDelay time.Duration) time.Duration {
@ -1038,111 +864,6 @@ func (r *RQLiteManager) clearRaftState(rqliteDataDir string) error {
return nil
}
// isStuckInConfigurationMismatch checks if we're stuck due to configuration mismatch
// This detects both configuration mismatch AND split-brain scenarios
func (r *RQLiteManager) isStuckInConfigurationMismatch() bool {
// First check for split-brain (all followers, term 0, no peers)
if r.isInSplitBrainState() {
return true
}
// Then check for traditional configuration mismatch
status, err := r.getRQLiteStatus()
if err != nil {
r.logger.Debug("Cannot check Raft status for configuration mismatch", zap.Error(err))
return false // Can't determine, don't clear
}
raftState := strings.ToLower(status.Store.Raft.State)
hasLeader := status.Store.Raft.LeaderAddr != ""
// Stuck if: no leader AND state is not "leader" or "follower"
if !hasLeader && raftState != "leader" && raftState != "follower" {
if r.allPeersAreStuck() {
return true
}
}
return false
}
// allPeersAreStuck checks if all discovered peers also report no leader
// This helps confirm we're in a cluster-wide configuration mismatch, not just a local issue
func (r *RQLiteManager) allPeersAreStuck() bool {
if r.discoveryService == nil {
r.logger.Debug("No discovery service available to check peer status")
return false
}
peers := r.discoveryService.GetActivePeers()
if len(peers) == 0 {
r.logger.Debug("No peers discovered, might be network issue")
return false // No peers discovered, might be network issue
}
// Check if we can query peers and they all report no leader
stuckCount := 0
reachableCount := 0
for _, peer := range peers {
if r.peerHasLeader(peer.HTTPAddress) {
// Peer has a leader, so we're not in cluster-wide mismatch
return false
}
// Check if peer is at least reachable
if r.isPeerReachable(peer.HTTPAddress) {
reachableCount++
stuckCount++
}
}
// If we have reachable peers and they're all stuck, we're likely in cluster-wide config mismatch
if reachableCount > 0 && stuckCount == reachableCount {
r.logger.Debug("All reachable peers are also stuck",
zap.Int("reachable_peers", reachableCount),
zap.Int("total_peers", len(peers)))
return true
}
return false
}
// peerHasLeader checks if a peer has a leader by querying its status endpoint
func (r *RQLiteManager) peerHasLeader(httpAddr string) bool {
url := fmt.Sprintf("http://%s/status", httpAddr)
client := &http.Client{Timeout: 3 * time.Second}
resp, err := client.Get(url)
if err != nil {
return false // Can't reach peer
}
defer resp.Body.Close()
if resp.StatusCode != http.StatusOK {
return false
}
var status RQLiteStatus
if err := json.NewDecoder(resp.Body).Decode(&status); err != nil {
return false
}
// Peer has leader if leader address is set
return status.Store.Raft.LeaderAddr != ""
}
// isPeerReachable checks if a peer is at least responding to HTTP requests
func (r *RQLiteManager) isPeerReachable(httpAddr string) bool {
url := fmt.Sprintf("http://%s/status", httpAddr)
client := &http.Client{Timeout: 3 * time.Second}
resp, err := client.Get(url)
if err != nil {
return false
}
defer resp.Body.Close()
return resp.StatusCode == http.StatusOK
}
// isInSplitBrainState detects if we're in a split-brain scenario where all nodes
// are followers with no peers (each node thinks it's alone)
@ -1215,6 +936,20 @@ func (r *RQLiteManager) isInSplitBrainState() bool {
return false
}
// isPeerReachable checks if a peer is at least responding to HTTP requests
func (r *RQLiteManager) isPeerReachable(httpAddr string) bool {
url := fmt.Sprintf("http://%s/status", httpAddr)
client := &http.Client{Timeout: 3 * time.Second}
resp, err := client.Get(url)
if err != nil {
return false
}
defer resp.Body.Close()
return resp.StatusCode == http.StatusOK
}
// getPeerRQLiteStatus queries a peer's status endpoint
func (r *RQLiteManager) getPeerRQLiteStatus(httpAddr string) (*RQLiteStatus, error) {
url := fmt.Sprintf("http://%s/status", httpAddr)