Mirror of https://github.com/DeBrosOfficial/network.git (synced 2025-12-11 08:18:49 +00:00)
feat: enhance service shutdown and logging in development environment
- Improved the `stop` target in the Makefile to ensure graceful shutdown of development services, allowing for more reliable process termination.
- Updated the `StopAll` method in the ProcessManager to provide clearer logging during service shutdown, including progress updates and error handling.
- Added a new `PushNotificationService` to handle sending push notifications via Expo, including bulk notification capabilities and improved error handling.
- Refactored RQLite management to streamline node identification and logging, ensuring consistent behavior across node types during startup and recovery.
This commit is contained in:
parent 33ebf222ff
commit 4c1f842939

CHANGELOG.md (36 changes)
@@ -13,6 +13,42 @@ The format is based on [Keep a Changelog][keepachangelog] and adheres to [Semant

### Deprecated

### Fixed

## [0.69.17] - 2025-11-21

### Added

- Initial implementation of a Push Notification Service for the Gateway, utilizing the Expo API.
- Detailed documentation for RQLite operations, monitoring, and troubleshooting in the README.

### Changed

- Improved `make stop` and `dbn dev down` commands to ensure all development services are forcefully killed after a graceful shutdown attempt.
- Refactored RQLite startup logic to simplify cluster establishment and remove complex, error-prone leadership/recovery checks, relying on RQLite's built-in join mechanism.
- RQLite logs are now written to individual log files (e.g., `~/.debros/logs/rqlite-bootstrap.log`) instead of stdout/stderr, improving development environment clarity.
- Improved peer exchange discovery logging to suppress expected 'protocols not supported' warnings from lightweight clients like the Gateway.

### Deprecated

### Removed

### Fixed

## [0.69.16] - 2025-11-16

### Added
Makefile (8 changes)

@@ -19,7 +19,7 @@ test-e2e:

.PHONY: build clean test run-node run-node2 run-node3 run-example deps tidy fmt vet lint clear-ports install-hooks kill

VERSION := 0.69.16
VERSION := 0.69.17
COMMIT ?= $(shell git rev-parse --short HEAD 2>/dev/null || echo unknown)
DATE ?= $(shell date -u +%Y-%m-%dT%H:%M:%SZ)
LDFLAGS := -X 'main.version=$(VERSION)' -X 'main.commit=$(COMMIT)' -X 'main.date=$(DATE)'

@@ -88,11 +88,9 @@ dev: build
# Graceful shutdown of all dev services
stop:
    @if [ -f ./bin/dbn ]; then \
        ./bin/dbn dev down; \
    else \
        echo "⚠️ dbn binary not found, using force kill instead..."; \
        bash scripts/dev-kill-all.sh; \
        ./bin/dbn dev down || true; \
    fi
    @bash scripts/dev-kill-all.sh

# Force kill all processes (immediate termination)
kill:
README.md (335 changes)

@@ -590,15 +590,350 @@ Common endpoints (see `openapi/gateway.yaml` for the full spec):

- `POST /v1/pubsub/publish`, `GET /v1/pubsub/topics`, `GET /v1/pubsub/ws?topic=<topic>`
- `POST /v1/storage/upload`, `POST /v1/storage/pin`, `GET /v1/storage/status/:cid`, `GET /v1/storage/get/:cid`, `DELETE /v1/storage/unpin/:cid`
## RQLite Operations & Monitoring

RQLite is the distributed SQL database backing DeBros Network. Proper monitoring and maintenance are critical for cluster health.

### Connecting to RQLite

```bash
# Local development (bootstrap) - port 5001
rqlite -H localhost -p 5001

# Local development (bootstrap2) - port 5011
rqlite -H localhost -p 5011

# Production nodes
rqlite -H 192.168.1.151 -p 5001
```

### Health Checks (CRITICAL for Cluster Health)

```bash
# Check node status and diagnostics
rqlite -H localhost -p 5001 ".status"

# List all nodes in cluster (verify all nodes connected)
rqlite -H localhost -p 5001 ".nodes"

# Check if node is ready for operations
rqlite -H localhost -p 5001 ".ready"

# Get Go runtime info (goroutines, memory, performance)
rqlite -H localhost -p 5001 ".expvar"

# Show all tables
rqlite -H localhost -p 5001 ".tables"

# Show schema (CREATE statements)
rqlite -H localhost -p 5001 ".schema"

# Show all indexes
rqlite -H localhost -p 5001 ".indexes"
```

### Backup & Restore

```bash
# Backup database
rqlite -H localhost -p 5001 ".backup ~/rqlite-backup.db"

# Restore from backup
rqlite -H localhost -p 5001 ".restore ~/rqlite-backup.db"

# Dump database in SQL text format
rqlite -H localhost -p 5001 ".dump ~/rqlite-dump.sql"
```
### Consistency Levels (Important for Data Integrity)

RQLite supports three consistency levels for read operations:

```bash
# View current consistency level
rqlite -H localhost -p 5001 ".consistency"

# Set to weak (default, good balance for most applications)
rqlite -H localhost -p 5001 ".consistency weak"

# Set to strong (guaranteed consistency across entire cluster)
rqlite -H localhost -p 5001 ".consistency strong"

# Set to none (fastest reads, no consistency guarantees)
rqlite -H localhost -p 5001 ".consistency none"
```

**Recommendation**: Use `weak` for general operations, `strong` when data integrity is critical, and `none` only for cache-like data.
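The same levels can also be selected per request through RQLite's HTTP API using the `level` query parameter, which is useful when one client mixes cache-style and integrity-critical reads. A minimal hedged sketch against the local bootstrap node (the query itself is just a placeholder):

```bash
# Per-request read consistency via the RQLite HTTP API (level = none | weak | strong)
curl -G 'http://localhost:5001/db/query?pretty&level=strong' \
  --data-urlencode 'q=SELECT COUNT(*) FROM sqlite_master'
```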
### Cluster Management

```bash
# Show detailed cluster diagnostics
rqlite -H localhost -p 5001 ".sysdump /tmp/rqlite-diagnostic.txt"

# Remove a node from cluster (use raft ID from .nodes output)
rqlite -H localhost -p 5001 ".remove <raft_id>"
```

### RQLite Log Files (Development)

All RQLite logs are now written to individual files for easier debugging:

```
~/.debros/logs/rqlite-bootstrap.log
~/.debros/logs/rqlite-bootstrap2.log
~/.debros/logs/rqlite-node2.log
~/.debros/logs/rqlite-node3.log
~/.debros/logs/rqlite-node4.log
```

View logs:

```bash
tail -f ~/.debros/logs/rqlite-bootstrap.log
tail -f ~/.debros/logs/rqlite-node2.log
dbn dev logs rqlite-bootstrap --follow
```
## Development Environment Operations

### Starting & Managing Development Environment

```bash
# Start the complete development stack (2 bootstraps + 3 nodes + gateway)
make dev

# Check status of running services
dbn dev status

# Stop all services
dbn dev down
```

### Development Logs

```bash
# View logs for a specific component
dbn dev logs bootstrap
dbn dev logs bootstrap2
dbn dev logs node2
dbn dev logs node3
dbn dev logs node4
dbn dev logs gateway
dbn dev logs olric
dbn dev logs anon

# Follow logs in real-time (like tail -f)
dbn dev logs bootstrap --follow
dbn dev logs rqlite-bootstrap --follow
```
### Key Development Endpoints

```
Gateway:            http://localhost:6001
Bootstrap IPFS:     http://localhost:4501
Bootstrap2 IPFS:    http://localhost:4511
Node2 IPFS:         http://localhost:4502
Node3 IPFS:         http://localhost:4503
Node4 IPFS:         http://localhost:4504
Anon SOCKS:         127.0.0.1:9050
Olric Cache:        http://localhost:3320
RQLite Bootstrap:   http://localhost:5001
RQLite Bootstrap2:  http://localhost:5011
RQLite Node2:       http://localhost:5002
RQLite Node3:       http://localhost:5003
RQLite Node4:       http://localhost:5004
```
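A quick way to confirm the stack is actually listening on these ports is to probe a few of them. A hedged smoke-test sketch: `/readyz` and `/status` are standard rqlited HTTP endpoints, while the gateway check only verifies that something answers on port 6001 (it does not assume any particular route):

```bash
# RQLite bootstrap: readiness and status
curl -s http://localhost:5001/readyz
curl -s 'http://localhost:5001/status?pretty' | head -n 20

# Gateway: print only the HTTP status code returned on port 6001
curl -s -o /dev/null -w '%{http_code}\n' http://localhost:6001/
```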
## IPFS Configuration

### Ensure Consistent Cluster Setup

All nodes in a cluster must have identical `cluster.secret` and `swarm.key`:

```bash
# Copy swarm key to each host (adjust path for bootstrap vs node):

# Bootstrap node
sudo cp /home/debros/.debros/secrets/swarm.key /home/debros/.debros/data/bootstrap/ipfs/repo/swarm.key

# Regular nodes
sudo cp /home/debros/.debros/secrets/swarm.key /home/debros/.debros/data/node/ipfs/repo/swarm.key

# Fix permissions
sudo chown debros:debros /home/debros/.debros/data/*/ipfs/repo/swarm.key
sudo chmod 600 /home/debros/.debros/data/*/ipfs/repo/swarm.key
```

### Important IPFS Configuration Notes

- **Production**: Update the Olric config - change `0.0.0.0` to the node's actual IP address for both entries.
- **All Nodes**: Must have identical `cluster.secret` and `swarm.key` for the cluster to form; see the verification sketch below.
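One simple way to verify the keys really are identical is to compare their hashes on every host, using the paths from the commands above; all nodes must report the same hash for `swarm.key`:

```bash
# Run on each host; every output line must show the same hash
sha256sum /home/debros/.debros/secrets/swarm.key \
          /home/debros/.debros/data/*/ipfs/repo/swarm.key
```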
## Troubleshooting

### General Issues

- **Config directory errors**: Ensure `~/.debros/` exists, is writable, and has free disk space (`touch ~/.debros/test && rm ~/.debros/test`).
- **Port conflicts**: Inspect with `lsof -i :4001` (or other ports) and stop conflicting processes or regenerate configs with new ports.
- **Missing configs**: Run `./bin/dbn config init` before starting nodes.
- **Cluster join issues**: Confirm the bootstrap node is running, the `peer.info` multiaddr matches `bootstrap_peers`, and firewall rules allow the P2P ports.
### RQLite Troubleshooting

#### Cluster Not Forming

```bash
# Verify all nodes see each other
rqlite -H localhost -p 5001 ".nodes"

# Check node readiness
rqlite -H localhost -p 5001 ".ready"

# Check status and Raft logs
rqlite -H localhost -p 5001 ".status"
```

#### Broken RQLite Raft (Production)

```bash
# Fix RQLite Raft consensus
sudo env HOME=/home/debros network-cli rqlite fix
```

#### Reset RQLite State (DESTRUCTIVE - Last Resort Only)

```bash
# ⚠️ WARNING: This destroys all RQLite data!
rm -f ~/.debros/data/rqlite/raft.db
rm -f ~/.debros/data/rqlite/raft/peers.json
```

#### Kill IPFS Cluster Service

```bash
pkill -f ipfs-cluster-service
```
### Services Not Starting

```bash
# Check service status
systemctl status debros-node-bootstrap

# View detailed logs
journalctl -u debros-node-bootstrap -n 100

# Check log files
tail -f /home/debros/.debros/logs/node-bootstrap.log
```

### Port Conflicts

```bash
# Check what's using specific ports
sudo lsof -i :4001  # P2P port
sudo lsof -i :5001  # RQLite HTTP
sudo lsof -i :6001  # Gateway
sudo lsof -i :9094  # IPFS Cluster API

# Kill all DeBros-related processes (except Anyone on 9050)
lsof -ti:7001,7002,7003,5001,5002,5003,6001,4001,3320,3322,9094 | xargs kill -9 2>/dev/null && echo "Killed processes" || echo "No processes found"
```

### Systemd Service Management

```bash
# Stop all services (keeps Anyone proxy running on 9050)
sudo systemctl stop debros-*

# Disable services from auto-start
sudo systemctl disable debros-*

# Restart all services
sudo systemctl restart debros-*

# Enable services for auto-start on boot
sudo systemctl enable debros-*

# View all DeBros services
systemctl list-units 'debros-*'

# Clean up failed services
sudo systemctl reset-failed
```
### Reset Installation (⚠️ Destroys All Data)

```bash
# Start fresh (production)
sudo dbn prod uninstall
sudo rm -rf /home/debros/.debros
sudo dbn prod install --bootstrap --branch nightly
```
## Operations Cheat Sheet

### User Management (Linux)

```bash
# Switch to DeBros user
sudo -u debros bash

# Kill all DeBros user processes
sudo killall -9 -u debros

# Remove DeBros user completely
sudo userdel -r -f debros
```

### Installation & Deployment

```bash
# Local development
make dev

# Install nightly branch
wget https://raw.githubusercontent.com/DeBrosOfficial/network/refs/heads/nightly/scripts/install-debros-network.sh
chmod +x ./install-debros-network.sh
./install-debros-network.sh --prerelease --nightly

# Production bootstrap node
sudo dbn prod install --bootstrap --branch nightly

# Production secondary node
sudo dbn prod install \
  --vps-ip <your_ip> \
  --peers /ip4/<bootstrap_ip>/tcp/4001/p2p/<peer_id> \
  --branch nightly
```

### Configuration & Sudoers (Deploy User)

```bash
# Add to sudoers for deploy automation
ubuntu ALL=(ALL) NOPASSWD: /bin/bash
ubuntu ALL=(ALL) NOPASSWD: /usr/bin/make

# Git configuration
git config --global --add safe.directory /home/debros/src
```

### Authentication

```bash
# Login to gateway
env DEBROS_GATEWAY_URL=https://node-kv4la8.debros.network dbn auth login
```
## Resources

- [RQLite CLI Documentation](https://rqlite.io/docs/cli/)
- [RQLite Features](https://rqlite.io/docs/features/)
- [RQLite Clustering Guide](https://rqlite.io/docs/clustering/)
- [RQLite Security](https://rqlite.io/docs/security/)
- [RQLite Backup & Restore](https://rqlite.io/docs/backup-and-restore/)
- Go modules: `go mod tidy`, `go test ./...`
- Automation: `make build`, `make dev`, `make run-gateway`, `make lint`
- API reference: `openapi/gateway.yaml`
@@ -136,7 +136,10 @@ func handleDevDown(args []string) {

    if err := pm.StopAll(ctx); err != nil {
        fmt.Fprintf(os.Stderr, "⚠️ Error stopping services: %v\n", err)
        os.Exit(1)
    }

    fmt.Printf("✅ All services have been stopped\n\n")
}

func handleDevStatus(args []string) {
@@ -6,6 +6,7 @@ import (
    "errors"
    "io"
    "strconv"
    "strings"
    "time"

    "github.com/libp2p/go-libp2p/core/host"
@@ -420,11 +421,20 @@ func (d *Manager) requestPeersFromPeer(ctx context.Context, peerID peer.ID, limi
    // Open a stream to the peer
    stream, err := d.host.NewStream(ctx, peerID, PeerExchangeProtocol)
    if err != nil {
        // Suppress repeated warnings for the same peer (log once per minute max)
        // Check if this is a "protocols not supported" error (expected for lightweight clients like gateway)
        if strings.Contains(err.Error(), "protocols not supported") {
            // This is a lightweight client (gateway, etc.) that doesn't support peer exchange - expected behavior
            // Track it to avoid repeated attempts, but don't log as it's not an error
            d.failedPeerExchanges[peerID] = time.Now()
            return nil
        }

        // For actual connection errors, log but suppress repeated warnings for the same peer
        lastFailure, seen := d.failedPeerExchanges[peerID]
        if !seen || time.Since(lastFailure) > time.Minute {
            d.logger.Debug("Failed to open peer exchange stream",
            d.logger.Debug("Failed to open peer exchange stream with node",
                zap.String("peer_id", peerID.String()[:8]+"..."),
                zap.String("reason", "peer does not support peer exchange protocol or connection failed"),
                zap.Error(err))
            d.failedPeerExchanges[peerID] = time.Now()
        }
@@ -95,7 +95,7 @@ func (pm *ProcessManager) StartAll(ctx context.Context) error {

// StopAll stops all running processes
func (pm *ProcessManager) StopAll(ctx context.Context) error {
    fmt.Fprintf(pm.logWriter, "\n🛑 Stopping development environment...\n")
    fmt.Fprintf(pm.logWriter, "\n🛑 Stopping development environment...\n\n")

    topology := DefaultTopology()
    var services []string

@@ -116,11 +116,22 @@ func (pm *ProcessManager) StopAll(ctx context.Context) error {
    }
    services = append(services, "olric", "anon")

    fmt.Fprintf(pm.logWriter, "Stopping %d services...\n\n", len(services))

    // Stop all processes sequentially (in dependency order) and wait for each
    stoppedCount := 0
    for _, svc := range services {
        pm.stopProcess(svc)
        if err := pm.stopProcess(svc); err != nil {
            fmt.Fprintf(pm.logWriter, "⚠️ Error stopping %s: %v\n", svc, err)
        } else {
            stoppedCount++
        }

        // Show progress
        fmt.Fprintf(pm.logWriter, " [%d/%d] stopped\n", stoppedCount, len(services))
    }

    fmt.Fprintf(pm.logWriter, "✓ All services stopped\n\n")
    fmt.Fprintf(pm.logWriter, "\n✅ All %d services have been stopped\n\n", stoppedCount)
    return nil
}
pkg/gateway/push_notifications.go (new file, 184 lines)

@@ -0,0 +1,184 @@
package gateway

import (
    "bytes"
    "context"
    "encoding/json"
    "fmt"
    "io"
    "net/http"
    "time"

    "go.uber.org/zap"
)

// PushNotificationService handles sending push notifications via Expo
type PushNotificationService struct {
    logger *zap.Logger
    client *http.Client
}

// ExpoTicket represents the response from Expo API
type ExpoTicket struct {
    ID    string `json:"id"`
    Error string `json:"error,omitempty"`
}

// ExpoPushMessage represents a message to send via Expo
type ExpoPushMessage struct {
    To       string                 `json:"to"`
    Title    string                 `json:"title"`
    Body     string                 `json:"body"`
    Data     map[string]interface{} `json:"data,omitempty"`
    Sound    string                 `json:"sound,omitempty"`
    Badge    int                    `json:"badge,omitempty"`
    Priority string                 `json:"priority,omitempty"`
    // iOS specific
    MutableContent bool   `json:"mutableContent,omitempty"`
    IosIcon        string `json:"iosIcon,omitempty"`
    // Android specific
    AndroidBigLargeIcon string `json:"androidBigLargeIcon,omitempty"`
    ChannelID           string `json:"channelId,omitempty"`
}

// NewPushNotificationService creates a new push notification service
func NewPushNotificationService(logger *zap.Logger) *PushNotificationService {
    return &PushNotificationService{
        logger: logger,
        client: &http.Client{
            Timeout: 10 * time.Second,
        },
    }
}

// SendNotification sends a push notification via Expo
func (pns *PushNotificationService) SendNotification(
    ctx context.Context,
    expoPushToken string,
    title string,
    body string,
    data map[string]interface{},
    avatarURL string,
) error {
    if expoPushToken == "" {
        return fmt.Errorf("empty expo push token")
    }

    message := ExpoPushMessage{
        To:       expoPushToken,
        Title:    title,
        Body:     body,
        Data:     data,
        Sound:    "default",
        Priority: "high",
        // Enable mutable content for iOS to allow Notification Service Extension
        MutableContent:      true,
        ChannelID:           "messages",
        AndroidBigLargeIcon: avatarURL,
    }

    // For iOS, include avatar in data so Notification Service Extension can fetch it
    if avatarURL != "" {
        if message.Data == nil {
            message.Data = make(map[string]interface{})
        }
        message.Data["avatar_url"] = avatarURL
    }

    return pns.sendExpoRequest(ctx, message)
}

// SendBulkNotifications sends notifications to multiple users
func (pns *PushNotificationService) SendBulkNotifications(
    ctx context.Context,
    expoPushTokens []string,
    title string,
    body string,
    data map[string]interface{},
    avatarURL string,
) []error {
    errors := make([]error, 0)

    for _, token := range expoPushTokens {
        if err := pns.SendNotification(ctx, token, title, body, data, avatarURL); err != nil {
            errors = append(errors, fmt.Errorf("failed to send to token %s: %w", token, err))
        }
    }

    return errors
}

// sendExpoRequest sends a request to the Expo push notification API
func (pns *PushNotificationService) sendExpoRequest(ctx context.Context, message ExpoPushMessage) error {
    const expoAPIURL = "https://exp.host/--/api/v2/push/send"

    body, err := json.Marshal(message)
    if err != nil {
        pns.logger.Error("failed to marshal push notification",
            zap.Error(err),
            zap.String("to", message.To))
        return fmt.Errorf("marshal error: %w", err)
    }

    req, err := http.NewRequestWithContext(ctx, http.MethodPost, expoAPIURL, bytes.NewBuffer(body))
    if err != nil {
        pns.logger.Error("failed to create push notification request",
            zap.Error(err),
            zap.String("to", message.To))
        return fmt.Errorf("request creation error: %w", err)
    }

    req.Header.Set("Content-Type", "application/json")

    resp, err := pns.client.Do(req)
    if err != nil {
        pns.logger.Error("failed to send push notification",
            zap.Error(err),
            zap.String("to", message.To))
        return fmt.Errorf("send error: %w", err)
    }
    defer resp.Body.Close()

    respBody, err := io.ReadAll(resp.Body)
    if err != nil {
        pns.logger.Error("failed to read push notification response",
            zap.Error(err),
            zap.String("to", message.To))
        return fmt.Errorf("response read error: %w", err)
    }

    // Check for API errors
    if resp.StatusCode != http.StatusOK {
        pns.logger.Warn("push notification API error",
            zap.Int("status_code", resp.StatusCode),
            zap.String("response", string(respBody)),
            zap.String("to", message.To))
        return fmt.Errorf("API returned status %d: %s", resp.StatusCode, string(respBody))
    }

    // Parse response
    var tickets []ExpoTicket
    if err := json.Unmarshal(respBody, &tickets); err != nil {
        pns.logger.Error("failed to parse push notification response",
            zap.Error(err),
            zap.String("response", string(respBody)))
        return fmt.Errorf("parse error: %w", err)
    }

    // Check for errors in tickets
    for _, ticket := range tickets {
        if ticket.Error != "" {
            pns.logger.Warn("push notification error in ticket",
                zap.String("error", ticket.Error),
                zap.String("to", message.To))
            return fmt.Errorf("ticket error: %s", ticket.Error)
        }
    }

    pns.logger.Info("push notification sent successfully",
        zap.String("to", message.To),
        zap.String("title", message.Title))

    return nil
}
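For context, a hedged sketch of how the new service might be called from other code; the import path, logger setup, token values, and data keys are illustrative assumptions, not part of this commit. Note that `SendBulkNotifications` issues one HTTP request per token; Expo's push endpoint also accepts batched message arrays, which could reduce request volume later if needed.

```go
package main

import (
	"context"
	"time"

	"go.uber.org/zap"

	// Assumed import path, based on the repository layout.
	"github.com/DeBrosOfficial/network/pkg/gateway"
)

func main() {
	logger, _ := zap.NewDevelopment()
	defer logger.Sync()

	svc := gateway.NewPushNotificationService(logger)

	ctx, cancel := context.WithTimeout(context.Background(), 15*time.Second)
	defer cancel()

	// Single notification; the Expo push token comes from the mobile client (placeholder here).
	if err := svc.SendNotification(ctx,
		"ExponentPushToken[xxxxxxxxxxxxxxxxxxxxxx]",
		"New message",
		"You have a new message",
		map[string]interface{}{"conversation_id": "123"}, // hypothetical payload
		"https://example.com/avatar.png",                 // avatar used for rich notifications
	); err != nil {
		logger.Warn("push failed", zap.Error(err))
	}

	// Bulk send: one error is returned for each token that failed.
	errs := svc.SendBulkNotifications(ctx,
		[]string{"ExponentPushToken[aaa]", "ExponentPushToken[bbb]"},
		"Announcement", "Hello everyone", nil, "")
	for _, e := range errs {
		logger.Warn("bulk push failure", zap.Error(e))
	}
}
```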
@@ -70,15 +70,26 @@ func NewNode(cfg *config.Config) (*Node, error) {

func (n *Node) startRQLite(ctx context.Context) error {
    n.logger.Info("Starting RQLite database")

    // Determine node identifier for log filename - use node ID for unique filenames
    nodeID := n.config.Node.ID
    if nodeID == "" {
        // Fallback to type if ID is not set
        nodeID = n.config.Node.Type
        if nodeID == "" {
            nodeID = "node"
        }
    }

    // Create RQLite manager
    n.rqliteManager = database.NewRQLiteManager(&n.config.Database, &n.config.Discovery, n.config.Node.DataDir, n.logger.Logger)
    n.rqliteManager.SetNodeType(nodeID)

    // Initialize cluster discovery service if LibP2P host is available
    if n.host != nil && n.discoveryManager != nil {
        // Determine node type
        nodeType := "node"
        // Determine node type for cluster discovery (bootstrap or node)
        discoveryNodeType := "node"
        if n.config.Node.Type == "bootstrap" {
            nodeType = "bootstrap"
            discoveryNodeType = "bootstrap"
        }

        // Create cluster discovery service

@@ -87,7 +98,7 @@ func (n *Node) startRQLite(ctx context.Context) error {
            n.discoveryManager,
            n.rqliteManager,
            n.config.Node.ID,
            nodeType,
            discoveryNodeType,
            n.config.Discovery.RaftAdvAddress,
            n.config.Discovery.HttpAdvAddress,
            n.config.Node.DataDir,
@@ -633,11 +633,7 @@ func (c *ClusterDiscoveryService) WaitForDiscoverySettling(ctx context.Context)

// TriggerSync manually triggers a cluster membership sync
func (c *ClusterDiscoveryService) TriggerSync() {
    // For bootstrap nodes, wait a bit for peer discovery to stabilize
    if c.nodeType == "bootstrap" {
        time.Sleep(5 * time.Second)
    }

    // All nodes use the same discovery timing for consistency
    c.updateClusterMembership()
}
@@ -10,7 +10,6 @@ import (
    "os"
    "os/exec"
    "path/filepath"
    "strconv"
    "strings"
    "syscall"
    "time"

@@ -26,6 +25,7 @@ type RQLiteManager struct {
    config         *config.DatabaseConfig
    discoverConfig *config.DiscoveryConfig
    dataDir        string
    nodeType       string // "bootstrap" or "node"
    logger         *zap.Logger
    cmd            *exec.Cmd
    connection     *gorqlite.Connection

@@ -81,6 +81,13 @@ func (r *RQLiteManager) SetDiscoveryService(service *ClusterDiscoveryService) {
    r.discoveryService = service
}

// SetNodeType sets the node type for this RQLite manager ("bootstrap" or "node")
func (r *RQLiteManager) SetNodeType(nodeType string) {
    if nodeType != "" {
        r.nodeType = nodeType
    }
}

// UpdateAdvertisedAddresses overrides the discovery advertised addresses when cluster discovery
// infers a better host than what was provided via configuration (e.g. replacing localhost).
func (r *RQLiteManager) UpdateAdvertisedAddresses(raftAddr, httpAddr string) {
@@ -233,7 +240,7 @@ func (r *RQLiteManager) launchProcess(ctx context.Context, rqliteDataDir string)
        "-raft-addr", fmt.Sprintf("0.0.0.0:%d", r.config.RQLiteRaftPort),
    }

    // Add join address if specified (for non-bootstrap or secondary bootstrap nodes)
    // All nodes follow the same join logic - either join specified address or start as single-node cluster
    if r.config.RQLiteJoinAddress != "" {
        r.logger.Info("Joining RQLite cluster", zap.String("join_address", r.config.RQLiteJoinAddress))

@@ -259,28 +266,9 @@ func (r *RQLiteManager) launchProcess(ctx context.Context, rqliteDataDir string)
        // Add retry parameters to handle slow cluster startup (e.g., during recovery)
        args = append(args, "-join", joinArg, "-join-attempts", "30", "-join-interval", "10s")
    } else {
        r.logger.Info("No join address specified - starting as new cluster")

        // For bootstrap nodes, use bootstrap-expect if we know about other peers
        if r.discoveryService != nil {
            allPeers := r.discoveryService.GetAllPeers()
            remotePeerCount := 0
            for _, peer := range allPeers {
                if peer.NodeID != r.discoverConfig.RaftAdvAddress {
                    remotePeerCount++
                }
            }

            // Use bootstrap-expect if we have discovered enough peers
            // This tells RQLite to wait for the expected number of nodes before forming cluster
            if remotePeerCount >= (r.config.MinClusterSize - 1) {
                expectedPeers := r.config.MinClusterSize
                args = append(args, "-bootstrap-expect", strconv.Itoa(expectedPeers))
                r.logger.Info("Using bootstrap-expect to wait for cluster formation",
                    zap.Int("expected_peers", expectedPeers),
                    zap.Int("remote_peers_discovered", remotePeerCount))
            }
        }
        r.logger.Info("No join address specified - starting as single-node cluster")
        // When no join address is provided, rqlited will start as a single-node cluster
        // This is expected for the first node in a fresh cluster
    }

    // Add data directory as positional argument
@@ -295,14 +283,41 @@ func (r *RQLiteManager) launchProcess(ctx context.Context, rqliteDataDir string)
    // Start RQLite process (not bound to ctx for graceful Stop handling)
    r.cmd = exec.Command("rqlited", args...)

    // Enable debug logging of RQLite process to help diagnose issues
    r.cmd.Stdout = os.Stdout
    r.cmd.Stderr = os.Stderr
    // Setup log file for RQLite output
    // Determine node type for log filename
    nodeType := r.nodeType
    if nodeType == "" {
        nodeType = "node"
    }

    // Create logs directory
    logsDir := filepath.Join(filepath.Dir(r.dataDir), "logs")
    if err := os.MkdirAll(logsDir, 0755); err != nil {
        return fmt.Errorf("failed to create logs directory at %s: %w", logsDir, err)
    }

    // Open log file for RQLite output
    logPath := filepath.Join(logsDir, fmt.Sprintf("rqlite-%s.log", nodeType))
    logFile, err := os.OpenFile(logPath, os.O_CREATE|os.O_WRONLY|os.O_APPEND, 0644)
    if err != nil {
        return fmt.Errorf("failed to open RQLite log file at %s: %w", logPath, err)
    }

    r.logger.Info("RQLite logs will be written to file",
        zap.String("path", logPath))

    r.cmd.Stdout = logFile
    r.cmd.Stderr = logFile

    if err := r.cmd.Start(); err != nil {
        logFile.Close()
        return fmt.Errorf("failed to start RQLite: %w", err)
    }

    // Close the log file handle after process starts (the subprocess maintains its own reference)
    // This allows the file to be rotated or inspected while the process is running
    logFile.Close()

    return nil
}
@@ -337,19 +352,18 @@ func (r *RQLiteManager) waitForReadyAndConnect(ctx context.Context) error {
        // Check if error is "store is not open" (recovery scenario)
        if strings.Contains(err.Error(), "store is not open") {
            if attempt < maxConnectAttempts-1 {
                // Only retry for joining nodes; bootstrap nodes should fail fast
                if r.config.RQLiteJoinAddress != "" {
                    if attempt%3 == 0 {
                        r.logger.Debug("RQLite store not yet accessible for connection, retrying...",
                            zap.Int("attempt", attempt+1), zap.Error(err))
                    }
                    time.Sleep(connectBackoff)
                    connectBackoff = time.Duration(float64(connectBackoff) * 1.5)
                    if connectBackoff > 5*time.Second {
                        connectBackoff = 5 * time.Second
                    }
                    continue
                // Retry with exponential backoff for all nodes during recovery
                // The store may not open immediately, especially during cluster recovery
                if attempt%3 == 0 {
                    r.logger.Debug("RQLite store not yet accessible for connection, retrying...",
                        zap.Int("attempt", attempt+1), zap.Error(err))
                }
                time.Sleep(connectBackoff)
                connectBackoff = time.Duration(float64(connectBackoff) * 1.5)
                if connectBackoff > 5*time.Second {
                    connectBackoff = 5 * time.Second
                }
                continue
            }
        }
@@ -376,130 +390,52 @@ func (r *RQLiteManager) waitForReadyAndConnect(ctx context.Context) error {
    return nil
}

// establishLeadershipOrJoin establishes leadership (bootstrap) or waits for SQL availability (joining)
// establishLeadershipOrJoin handles post-startup cluster establishment
// All nodes follow the same pattern: wait for SQL availability
// For nodes without a join address, RQLite automatically forms a single-node cluster and becomes leader
func (r *RQLiteManager) establishLeadershipOrJoin(ctx context.Context, rqliteDataDir string) error {
    if r.config.RQLiteJoinAddress == "" {
        // Bootstrap node logic with data safety checks
        r.logger.Info("Bootstrap node: checking if safe to lead")
        // First node - no join address specified
        // RQLite will automatically form a single-node cluster and become leader
        r.logger.Info("Starting as first node in cluster")

        // SAFETY: Check if we can safely become leader
        canLead, err := r.canSafelyBecomeLeader()
        if !canLead && err != nil {
            r.logger.Warn("Not safe to become leader, attempting to join existing cluster",
                zap.Error(err))

            // Find node with highest log index and join it
            if r.discoveryService != nil {
                targetNode := r.discoveryService.GetNodeWithHighestLogIndex()
                if targetNode != nil {
                    r.logger.Info("Joining node with higher data",
                        zap.String("target_node", targetNode.NodeID),
                        zap.String("raft_address", targetNode.RaftAddress),
                        zap.Uint64("their_index", targetNode.RaftLogIndex))
                    return r.joinExistingCluster(ctx, targetNode.RaftAddress)
                }
            }
        }

        // Safe to lead - attempt leadership
        leadershipErr := r.waitForLeadership(ctx)
        if leadershipErr == nil {
            r.logger.Info("Bootstrap node successfully established leadership")
            return nil
        }

        r.logger.Warn("Initial leadership attempt failed, may need cluster recovery",
            zap.Error(leadershipErr))

        // Try recovery if we have peers.json from discovery
        if r.discoveryService != nil {
            peersPath := filepath.Join(rqliteDataDir, "raft", "peers.json")
            if _, err := os.Stat(peersPath); err == nil {
                r.logger.Info("Attempting cluster recovery using peers.json",
                    zap.String("peers_file", peersPath))

                if recoveryErr := r.recoverCluster(ctx, peersPath); recoveryErr == nil {
                    r.logger.Info("Cluster recovery successful, retrying leadership")
                    leadershipErr = r.waitForLeadership(ctx)
                    if leadershipErr == nil {
                        r.logger.Info("Bootstrap node established leadership after recovery")
                        return nil
                    }
                } else {
                    r.logger.Warn("Cluster recovery failed", zap.Error(recoveryErr))
                }
            }

            // Check if we're stuck in configuration mismatch after recovery failed
            if leadershipErr != nil && r.isStuckInConfigurationMismatch() {
                r.logger.Warn("Detected persistent configuration mismatch, attempting automatic recovery")

                // Verify it's safe to clear state (peers have higher log indexes)
                if r.isSafeToClearState(rqliteDataDir) {
                    r.logger.Info("Clearing stale Raft state to resolve configuration mismatch")
                    if err := r.clearRaftState(rqliteDataDir); err != nil {
                        r.logger.Error("Failed to clear Raft state", zap.Error(err))
                    } else {
                        // Force write peers.json after clearing state
                        if r.discoveryService != nil {
                            r.logger.Info("Force writing peers.json after clearing state for configuration mismatch recovery")
                            if err := r.discoveryService.ForceWritePeersJSON(); err != nil {
                                r.logger.Error("Failed to force write peers.json", zap.Error(err))
                            }
                            // Update peersPath after force write
                            peersPath = filepath.Join(rqliteDataDir, "raft", "peers.json")
                        }
                        // Restart RQLite with clean state
                        r.logger.Info("Raft state cleared, restarting RQLite for clean rejoin")
                        if recoveryErr := r.recoverCluster(ctx, peersPath); recoveryErr == nil {
                            // Retry leadership after state clear
                            leadershipErr = r.waitForLeadership(ctx)
                            if leadershipErr == nil {
                                r.logger.Info("Bootstrap node established leadership after state clear")
                                return nil
                            }
                        }
                    }
                } else {
                    r.logger.Warn("Configuration mismatch detected but clearing state is unsafe",
                        zap.String("reason", "peers may not have more recent data"),
                        zap.String("action", "manual intervention may be required"))
                }
            }
        }

        // Final fallback: SQL availability
        r.logger.Warn("Leadership failed, trying SQL availability")
        // Wait for SQL to be available (indicates RQLite cluster is ready)
        sqlCtx := ctx
        if _, hasDeadline := ctx.Deadline(); !hasDeadline {
            var cancel context.CancelFunc
            sqlCtx, cancel = context.WithTimeout(context.Background(), 2*time.Minute)
            defer cancel()
        }

        if err := r.waitForSQLAvailable(sqlCtx); err != nil {
            if r.cmd != nil && r.cmd.Process != nil {
                _ = r.cmd.Process.Kill()
            }
            return fmt.Errorf("RQLite SQL not available: %w", err)
        }
        return nil
    } else {
        // Joining node logic
        r.logger.Info("Waiting for RQLite SQL availability (leader discovery)")
        sqlCtx := ctx
        if _, hasDeadline := ctx.Deadline(); !hasDeadline {
            var cancel context.CancelFunc
            sqlCtx, cancel = context.WithTimeout(context.Background(), 5*time.Minute)
            defer cancel()
        }
        if err := r.waitForSQLAvailable(sqlCtx); err != nil {
            if r.cmd != nil && r.cmd.Process != nil {
                _ = r.cmd.Process.Kill()
            }
            return fmt.Errorf("RQLite SQL not available: %w", err)
            return fmt.Errorf("SQL not available for first node: %w", err)
        }

        r.logger.Info("First node established successfully")
        return nil
    }

    // Joining node - wait for SQL availability (indicates it joined the leader)
    r.logger.Info("Waiting for RQLite SQL availability (joining cluster)")
    sqlCtx := ctx
    if _, hasDeadline := ctx.Deadline(); !hasDeadline {
        var cancel context.CancelFunc
        sqlCtx, cancel = context.WithTimeout(context.Background(), 5*time.Minute)
        defer cancel()
    }

    if err := r.waitForSQLAvailable(sqlCtx); err != nil {
        if r.cmd != nil && r.cmd.Process != nil {
            _ = r.cmd.Process.Kill()
        }
        return fmt.Errorf("RQLite SQL not available: %w", err)
    }

    r.logger.Info("Node successfully joined cluster")
    return nil
}

// hasExistingState returns true if the rqlite data directory already contains files or subdirectories.
@@ -526,16 +462,9 @@ func (r *RQLiteManager) waitForReady(ctx context.Context) error {
    url := fmt.Sprintf("http://localhost:%d/status", r.config.RQLitePort)
    client := &http.Client{Timeout: 2 * time.Second}

    // Determine timeout based on whether this is a joining node
    // Joining nodes in recovery may take longer to open the store
    var maxAttempts int
    if r.config.RQLiteJoinAddress != "" {
        // Joining node: allow up to 180 seconds (3 minutes) for recovery
        maxAttempts = 180
    } else {
        // Bootstrap node: allow 30 seconds
        maxAttempts = 30
    }
    // All nodes may need time to open the store during recovery
    // Use consistent timeout for cluster consistency
    maxAttempts := 180 // 180 seconds (3 minutes) for all nodes

    for i := 0; i < maxAttempts; i++ {
        select {
@@ -589,46 +518,6 @@ func (r *RQLiteManager) waitForReady(ctx context.Context) error {
}

// waitForLeadership waits for RQLite to establish leadership (for bootstrap nodes)
func (r *RQLiteManager) waitForLeadership(ctx context.Context) error {
    r.logger.Info("Waiting for RQLite to establish leadership...")

    maxAttempts := 30
    attempt := 0
    backoffDelay := 500 * time.Millisecond
    maxBackoff := 5 * time.Second

    for attempt < maxAttempts {
        select {
        case <-ctx.Done():
            return ctx.Err()
        default:
        }

        // Try a simple query to check if leadership is established
        if r.connection != nil {
            _, err := r.connection.QueryOne("SELECT 1")
            if err == nil {
                r.logger.Info("RQLite leadership established")
                return nil
            }
            // Log every 5th attempt or on first attempt to reduce noise
            if attempt%5 == 0 || attempt == 0 {
                r.logger.Debug("Waiting for leadership", zap.Int("attempt", attempt+1), zap.Error(err))
            }
        }

        // Exponential backoff with jitter
        time.Sleep(backoffDelay)
        backoffDelay = time.Duration(float64(backoffDelay) * 1.5)
        if backoffDelay > maxBackoff {
            backoffDelay = maxBackoff
        }
        attempt++
    }

    return fmt.Errorf("RQLite failed to establish leadership within timeout")
}

// GetConnection returns the RQLite connection
func (r *RQLiteManager) GetConnection() *gorqlite.Connection {
    return r.connection
@@ -819,69 +708,6 @@ func (r *RQLiteManager) testJoinAddress(joinAddress string) error {
    return nil
}

// canSafelyBecomeLeader checks if this node can safely become leader without causing data loss
func (r *RQLiteManager) canSafelyBecomeLeader() (bool, error) {
    // Get our current Raft log index
    ourLogIndex := r.getRaftLogIndex()

    // If no discovery service, assume it's safe (backward compatibility)
    if r.discoveryService == nil {
        r.logger.Debug("No discovery service, assuming safe to lead")
        return true, nil
    }

    // Query discovery service for other nodes
    otherNodes := r.discoveryService.GetActivePeers()

    if len(otherNodes) == 0 {
        // No other nodes - safe to bootstrap
        r.logger.Debug("No other nodes discovered, safe to lead",
            zap.Uint64("our_log_index", ourLogIndex))
        return true, nil
    }

    // Check if any other node has higher log index
    for _, peer := range otherNodes {
        if peer.RaftLogIndex > ourLogIndex {
            // Other node has more data - we should join them
            return false, fmt.Errorf(
                "node %s has higher log index (%d > %d), should join as follower",
                peer.NodeID, peer.RaftLogIndex, ourLogIndex)
        }
    }

    // We have most recent data or equal - safe to lead
    r.logger.Info("Safe to lead - we have most recent data",
        zap.Uint64("our_log_index", ourLogIndex),
        zap.Int("other_nodes_checked", len(otherNodes)))
    return true, nil
}

// joinExistingCluster attempts to join an existing cluster as a follower
func (r *RQLiteManager) joinExistingCluster(ctx context.Context, raftAddress string) error {
    r.logger.Info("Attempting to join existing cluster",
        zap.String("target_raft_address", raftAddress))

    // Wait for the target to be reachable
    if err := r.waitForJoinTarget(ctx, raftAddress, 2*time.Minute); err != nil {
        return fmt.Errorf("join target not reachable: %w", err)
    }

    // Wait for SQL availability (the target should have a leader)
    sqlCtx := ctx
    if _, hasDeadline := ctx.Deadline(); !hasDeadline {
        var cancel context.CancelFunc
        sqlCtx, cancel = context.WithTimeout(context.Background(), 5*time.Minute)
        defer cancel()
    }

    if err := r.waitForSQLAvailable(sqlCtx); err != nil {
        return fmt.Errorf("failed to join cluster - SQL not available: %w", err)
    }

    r.logger.Info("Successfully joined existing cluster")
    return nil
}

// exponentialBackoff calculates exponential backoff duration with jitter
func (r *RQLiteManager) exponentialBackoff(attempt int, baseDelay time.Duration, maxDelay time.Duration) time.Duration {
@@ -1038,111 +864,6 @@ func (r *RQLiteManager) clearRaftState(rqliteDataDir string) error {
    return nil
}

// isStuckInConfigurationMismatch checks if we're stuck due to configuration mismatch
// This detects both configuration mismatch AND split-brain scenarios
func (r *RQLiteManager) isStuckInConfigurationMismatch() bool {
    // First check for split-brain (all followers, term 0, no peers)
    if r.isInSplitBrainState() {
        return true
    }

    // Then check for traditional configuration mismatch
    status, err := r.getRQLiteStatus()
    if err != nil {
        r.logger.Debug("Cannot check Raft status for configuration mismatch", zap.Error(err))
        return false // Can't determine, don't clear
    }

    raftState := strings.ToLower(status.Store.Raft.State)
    hasLeader := status.Store.Raft.LeaderAddr != ""

    // Stuck if: no leader AND state is not "leader" or "follower"
    if !hasLeader && raftState != "leader" && raftState != "follower" {
        if r.allPeersAreStuck() {
            return true
        }
    }

    return false
}

// allPeersAreStuck checks if all discovered peers also report no leader
// This helps confirm we're in a cluster-wide configuration mismatch, not just a local issue
func (r *RQLiteManager) allPeersAreStuck() bool {
    if r.discoveryService == nil {
        r.logger.Debug("No discovery service available to check peer status")
        return false
    }

    peers := r.discoveryService.GetActivePeers()
    if len(peers) == 0 {
        r.logger.Debug("No peers discovered, might be network issue")
        return false // No peers discovered, might be network issue
    }

    // Check if we can query peers and they all report no leader
    stuckCount := 0
    reachableCount := 0
    for _, peer := range peers {
        if r.peerHasLeader(peer.HTTPAddress) {
            // Peer has a leader, so we're not in cluster-wide mismatch
            return false
        }
        // Check if peer is at least reachable
        if r.isPeerReachable(peer.HTTPAddress) {
            reachableCount++
            stuckCount++
        }
    }

    // If we have reachable peers and they're all stuck, we're likely in cluster-wide config mismatch
    if reachableCount > 0 && stuckCount == reachableCount {
        r.logger.Debug("All reachable peers are also stuck",
            zap.Int("reachable_peers", reachableCount),
            zap.Int("total_peers", len(peers)))
        return true
    }

    return false
}

// peerHasLeader checks if a peer has a leader by querying its status endpoint
func (r *RQLiteManager) peerHasLeader(httpAddr string) bool {
    url := fmt.Sprintf("http://%s/status", httpAddr)
    client := &http.Client{Timeout: 3 * time.Second}

    resp, err := client.Get(url)
    if err != nil {
        return false // Can't reach peer
    }
    defer resp.Body.Close()

    if resp.StatusCode != http.StatusOK {
        return false
    }

    var status RQLiteStatus
    if err := json.NewDecoder(resp.Body).Decode(&status); err != nil {
        return false
    }

    // Peer has leader if leader address is set
    return status.Store.Raft.LeaderAddr != ""
}

// isPeerReachable checks if a peer is at least responding to HTTP requests
func (r *RQLiteManager) isPeerReachable(httpAddr string) bool {
    url := fmt.Sprintf("http://%s/status", httpAddr)
    client := &http.Client{Timeout: 3 * time.Second}

    resp, err := client.Get(url)
    if err != nil {
        return false
    }
    defer resp.Body.Close()

    return resp.StatusCode == http.StatusOK
}

// isInSplitBrainState detects if we're in a split-brain scenario where all nodes
// are followers with no peers (each node thinks it's alone)
@@ -1215,6 +936,20 @@ func (r *RQLiteManager) isInSplitBrainState() bool {
    return false
}

// isPeerReachable checks if a peer is at least responding to HTTP requests
func (r *RQLiteManager) isPeerReachable(httpAddr string) bool {
    url := fmt.Sprintf("http://%s/status", httpAddr)
    client := &http.Client{Timeout: 3 * time.Second}

    resp, err := client.Get(url)
    if err != nil {
        return false
    }
    defer resp.Body.Close()

    return resp.StatusCode == http.StatusOK
}

// getPeerRQLiteStatus queries a peer's status endpoint
func (r *RQLiteManager) getPeerRQLiteStatus(httpAddr string) (*RQLiteStatus, error) {
    url := fmt.Sprintf("http://%s/status", httpAddr)