mirror of
https://github.com/DeBrosOfficial/network.git
synced 2025-12-12 22:58:49 +00:00
works
This commit is contained in:
parent
f991d55676
commit
0b24c66d56
15
CHANGELOG.md
15
CHANGELOG.md
@ -13,6 +13,21 @@ The format is based on [Keep a Changelog][keepachangelog] and adheres to [Semant
|
||||
### Deprecated
|
||||
|
||||
### Fixed
|
||||
## [0.60.1] - 2025-11-09
|
||||
|
||||
### Added
|
||||
- Improved IPFS Cluster startup logic in development environment to ensure proper peer discovery and configuration.
|
||||
|
||||
### Changed
|
||||
- Refactored IPFS Cluster initialization in the development environment to use a multi-phase startup (bootstrap first, then followers) and explicitly clean stale cluster state (pebble, peerstore) before initialization.
|
||||
|
||||
### Deprecated
|
||||
|
||||
### Removed
|
||||
|
||||
### Fixed
|
||||
- Fixed an issue where IPFS Cluster nodes in the development environment might fail to join due to incorrect bootstrap configuration or stale state.
|
||||
|
||||
## [0.60.0] - 2025-11-09
|
||||
|
||||
### Added
|
||||
|
||||
2
Makefile
2
Makefile
@ -21,7 +21,7 @@ test-e2e:
|
||||
|
||||
.PHONY: build clean test run-node run-node2 run-node3 run-example deps tidy fmt vet lint clear-ports install-hooks kill
|
||||
|
||||
VERSION := 0.60.0
|
||||
VERSION := 0.60.1
|
||||
COMMIT ?= $(shell git rev-parse --short HEAD 2>/dev/null || echo unknown)
|
||||
DATE ?= $(shell date -u +%Y-%m-%dT%H:%M:%SZ)
|
||||
LDFLAGS := -X 'main.version=$(VERSION)' -X 'main.commit=$(COMMIT)' -X 'main.date=$(DATE)'
|
||||
|
||||
@ -38,7 +38,7 @@ func (pm *ProcessManager) IPFSHealthCheck(ctx context.Context, nodes []ipfsNodeI
|
||||
if strings.TrimSpace(line) != "" {
|
||||
peerCount++
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if peerCount < 2 {
|
||||
result.Details += fmt.Sprintf("%s: only %d peers (want 2+); ", node.name, peerCount)
|
||||
@ -99,7 +99,7 @@ func (pm *ProcessManager) checkRQLiteNode(ctx context.Context, name string, http
|
||||
if err := json.NewDecoder(resp.Body).Decode(&status); err != nil {
|
||||
result.Details = fmt.Sprintf("decode error: %v", err)
|
||||
return result
|
||||
}
|
||||
}
|
||||
|
||||
// Check the store.raft structure (RQLite 8 format)
|
||||
store, ok := status["store"].(map[string]interface{})
|
||||
|
||||
@ -418,14 +418,108 @@ func (pm *ProcessManager) startIPFSCluster(ctx context.Context) error {
|
||||
}
|
||||
}
|
||||
|
||||
for _, node := range nodes {
|
||||
serviceJSON := filepath.Join(node.clusterPath, "service.json")
|
||||
if _, err := os.Stat(serviceJSON); os.IsNotExist(err) {
|
||||
// Read cluster secret to ensure all nodes use the same PSK
|
||||
secretPath := filepath.Join(pm.debrosDir, "cluster-secret")
|
||||
clusterSecret, err := os.ReadFile(secretPath)
|
||||
if err != nil {
|
||||
return fmt.Errorf("failed to read cluster secret: %w", err)
|
||||
}
|
||||
clusterSecretHex := strings.TrimSpace(string(clusterSecret))
|
||||
|
||||
// Phase 1: Initialize and start bootstrap IPFS Cluster, then read its identity
|
||||
bootstrapMultiaddr := ""
|
||||
{
|
||||
node := nodes[0] // bootstrap
|
||||
|
||||
// Always clean stale cluster state to ensure fresh initialization with correct secret
|
||||
if err := pm.cleanClusterState(node.clusterPath); err != nil {
|
||||
fmt.Fprintf(pm.logWriter, " Warning: failed to clean cluster state for %s: %v\n", node.name, err)
|
||||
}
|
||||
|
||||
os.MkdirAll(node.clusterPath, 0755)
|
||||
fmt.Fprintf(pm.logWriter, " Initializing IPFS Cluster (%s)...\n", node.name)
|
||||
cmd := exec.CommandContext(ctx, "ipfs-cluster-service", "init", "--force")
|
||||
cmd.Env = append(os.Environ(),
|
||||
fmt.Sprintf("IPFS_CLUSTER_PATH=%s", node.clusterPath),
|
||||
fmt.Sprintf("CLUSTER_SECRET=%s", clusterSecretHex),
|
||||
)
|
||||
if output, err := cmd.CombinedOutput(); err != nil {
|
||||
fmt.Fprintf(pm.logWriter, " Warning: ipfs-cluster-service init failed: %v (output: %s)\n", err, string(output))
|
||||
}
|
||||
|
||||
// Ensure correct ports in service.json BEFORE starting daemon
|
||||
// This is critical: it sets the cluster listen port to clusterPort, not the default
|
||||
if err := pm.ensureIPFSClusterPorts(node.clusterPath, node.restAPIPort, node.clusterPort); err != nil {
|
||||
fmt.Fprintf(pm.logWriter, " Warning: failed to update IPFS Cluster config for %s: %v\n", node.name, err)
|
||||
}
|
||||
|
||||
// Verify the config was written correctly (debug: read it back)
|
||||
serviceJSONPath := filepath.Join(node.clusterPath, "service.json")
|
||||
if data, err := os.ReadFile(serviceJSONPath); err == nil {
|
||||
var verifyConfig map[string]interface{}
|
||||
if err := json.Unmarshal(data, &verifyConfig); err == nil {
|
||||
if cluster, ok := verifyConfig["cluster"].(map[string]interface{}); ok {
|
||||
if listenAddrs, ok := cluster["listen_multiaddress"].([]interface{}); ok {
|
||||
fmt.Fprintf(pm.logWriter, " Config verified: %s cluster listening on %v\n", node.name, listenAddrs)
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Start bootstrap cluster service
|
||||
pidPath := filepath.Join(pm.pidsDir, fmt.Sprintf("ipfs-cluster-%s.pid", node.name))
|
||||
logPath := filepath.Join(pm.debrosDir, "logs", fmt.Sprintf("ipfs-cluster-%s.log", node.name))
|
||||
|
||||
cmd = exec.CommandContext(ctx, "ipfs-cluster-service", "daemon")
|
||||
cmd.Env = append(os.Environ(), fmt.Sprintf("IPFS_CLUSTER_PATH=%s", node.clusterPath))
|
||||
cmd.Run()
|
||||
logFile, _ := os.Create(logPath)
|
||||
cmd.Stdout = logFile
|
||||
cmd.Stderr = logFile
|
||||
|
||||
if err := cmd.Start(); err != nil {
|
||||
fmt.Fprintf(pm.logWriter, " ⚠️ Failed to start ipfs-cluster-%s: %v\n", node.name, err)
|
||||
return err
|
||||
}
|
||||
|
||||
os.WriteFile(pidPath, []byte(fmt.Sprintf("%d", cmd.Process.Pid)), 0644)
|
||||
fmt.Fprintf(pm.logWriter, "✓ IPFS Cluster (%s) started (PID: %d, API: %d)\n", node.name, cmd.Process.Pid, node.restAPIPort)
|
||||
|
||||
// Wait for bootstrap to be ready and read its identity
|
||||
if err := pm.waitClusterReady(ctx, node.name, node.restAPIPort); err != nil {
|
||||
fmt.Fprintf(pm.logWriter, " Warning: IPFS Cluster %s did not become ready: %v\n", node.name, err)
|
||||
}
|
||||
|
||||
// Add a brief delay to allow identity.json to be written
|
||||
time.Sleep(500 * time.Millisecond)
|
||||
|
||||
// Read bootstrap peer ID for follower nodes to join
|
||||
peerID, err := pm.waitForClusterPeerID(ctx, filepath.Join(node.clusterPath, "identity.json"))
|
||||
if err != nil {
|
||||
fmt.Fprintf(pm.logWriter, " Warning: failed to read bootstrap peer ID: %v\n", err)
|
||||
} else {
|
||||
bootstrapMultiaddr = fmt.Sprintf("/ip4/127.0.0.1/tcp/%d/p2p/%s", node.clusterPort, peerID)
|
||||
fmt.Fprintf(pm.logWriter, " Bootstrap multiaddress: %s\n", bootstrapMultiaddr)
|
||||
}
|
||||
}
|
||||
|
||||
// Phase 2: Initialize and start follower IPFS Cluster nodes with bootstrap flag
|
||||
for i := 1; i < len(nodes); i++ {
|
||||
node := nodes[i]
|
||||
|
||||
// Always clean stale cluster state to ensure fresh initialization with correct secret
|
||||
if err := pm.cleanClusterState(node.clusterPath); err != nil {
|
||||
fmt.Fprintf(pm.logWriter, " Warning: failed to clean cluster state for %s: %v\n", node.name, err)
|
||||
}
|
||||
|
||||
os.MkdirAll(node.clusterPath, 0755)
|
||||
fmt.Fprintf(pm.logWriter, " Initializing IPFS Cluster (%s)...\n", node.name)
|
||||
cmd := exec.CommandContext(ctx, "ipfs-cluster-service", "init", "--force")
|
||||
cmd.Env = append(os.Environ(),
|
||||
fmt.Sprintf("IPFS_CLUSTER_PATH=%s", node.clusterPath),
|
||||
fmt.Sprintf("CLUSTER_SECRET=%s", clusterSecretHex),
|
||||
)
|
||||
if output, err := cmd.CombinedOutput(); err != nil {
|
||||
fmt.Fprintf(pm.logWriter, " Warning: ipfs-cluster-service init failed for %s: %v (output: %s)\n", node.name, err, string(output))
|
||||
}
|
||||
|
||||
// Ensure correct ports in service.json BEFORE starting daemon
|
||||
@ -433,11 +527,29 @@ func (pm *ProcessManager) startIPFSCluster(ctx context.Context) error {
|
||||
fmt.Fprintf(pm.logWriter, " Warning: failed to update IPFS Cluster config for %s: %v\n", node.name, err)
|
||||
}
|
||||
|
||||
// Start cluster service
|
||||
// Verify the config was written correctly (debug: read it back)
|
||||
serviceJSONPath := filepath.Join(node.clusterPath, "service.json")
|
||||
if data, err := os.ReadFile(serviceJSONPath); err == nil {
|
||||
var verifyConfig map[string]interface{}
|
||||
if err := json.Unmarshal(data, &verifyConfig); err == nil {
|
||||
if cluster, ok := verifyConfig["cluster"].(map[string]interface{}); ok {
|
||||
if listenAddrs, ok := cluster["listen_multiaddress"].([]interface{}); ok {
|
||||
fmt.Fprintf(pm.logWriter, " Config verified: %s cluster listening on %v\n", node.name, listenAddrs)
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Start follower cluster service with bootstrap flag
|
||||
pidPath := filepath.Join(pm.pidsDir, fmt.Sprintf("ipfs-cluster-%s.pid", node.name))
|
||||
logPath := filepath.Join(pm.debrosDir, "logs", fmt.Sprintf("ipfs-cluster-%s.log", node.name))
|
||||
|
||||
cmd := exec.CommandContext(ctx, "ipfs-cluster-service", "daemon")
|
||||
args := []string{"daemon"}
|
||||
if bootstrapMultiaddr != "" {
|
||||
args = append(args, "--bootstrap", bootstrapMultiaddr)
|
||||
}
|
||||
|
||||
cmd = exec.CommandContext(ctx, "ipfs-cluster-service", args...)
|
||||
cmd.Env = append(os.Environ(), fmt.Sprintf("IPFS_CLUSTER_PATH=%s", node.clusterPath))
|
||||
logFile, _ := os.Create(logPath)
|
||||
cmd.Stdout = logFile
|
||||
@ -450,12 +562,145 @@ func (pm *ProcessManager) startIPFSCluster(ctx context.Context) error {
|
||||
|
||||
os.WriteFile(pidPath, []byte(fmt.Sprintf("%d", cmd.Process.Pid)), 0644)
|
||||
fmt.Fprintf(pm.logWriter, "✓ IPFS Cluster (%s) started (PID: %d, API: %d)\n", node.name, cmd.Process.Pid, node.restAPIPort)
|
||||
|
||||
// Wait for follower node to connect to the bootstrap peer
|
||||
if err := pm.waitClusterReady(ctx, node.name, node.restAPIPort); err != nil {
|
||||
fmt.Fprintf(pm.logWriter, " Warning: IPFS Cluster %s did not become ready: %v\n", node.name, err)
|
||||
}
|
||||
}
|
||||
|
||||
// Phase 3: Wait for all cluster peers to discover each other
|
||||
fmt.Fprintf(pm.logWriter, " Waiting for IPFS Cluster peers to form...\n")
|
||||
if err := pm.waitClusterFormed(ctx, nodes[0].restAPIPort); err != nil {
|
||||
fmt.Fprintf(pm.logWriter, " Warning: IPFS Cluster did not form fully: %v\n", err)
|
||||
}
|
||||
|
||||
time.Sleep(1 * time.Second)
|
||||
return nil
|
||||
}
|
||||
|
||||
// waitForClusterPeerID polls the identity.json file until it appears and extracts the peer ID
|
||||
func (pm *ProcessManager) waitForClusterPeerID(ctx context.Context, identityPath string) (string, error) {
|
||||
maxRetries := 30
|
||||
retryInterval := 500 * time.Millisecond
|
||||
|
||||
for attempt := 0; attempt < maxRetries; attempt++ {
|
||||
data, err := os.ReadFile(identityPath)
|
||||
if err == nil {
|
||||
var identity map[string]interface{}
|
||||
if err := json.Unmarshal(data, &identity); err == nil {
|
||||
if id, ok := identity["id"].(string); ok {
|
||||
return id, nil
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
select {
|
||||
case <-time.After(retryInterval):
|
||||
continue
|
||||
case <-ctx.Done():
|
||||
return "", ctx.Err()
|
||||
}
|
||||
}
|
||||
|
||||
return "", fmt.Errorf("could not read cluster peer ID after %d seconds", (maxRetries * int(retryInterval.Milliseconds()) / 1000))
|
||||
}
|
||||
|
||||
// waitClusterReady polls the cluster REST API until it's ready
|
||||
func (pm *ProcessManager) waitClusterReady(ctx context.Context, name string, restAPIPort int) error {
|
||||
maxRetries := 30
|
||||
retryInterval := 500 * time.Millisecond
|
||||
|
||||
for attempt := 0; attempt < maxRetries; attempt++ {
|
||||
httpURL := fmt.Sprintf("http://127.0.0.1:%d/peers", restAPIPort)
|
||||
resp, err := http.Get(httpURL)
|
||||
if err == nil && resp.StatusCode == 200 {
|
||||
resp.Body.Close()
|
||||
return nil
|
||||
}
|
||||
if resp != nil {
|
||||
resp.Body.Close()
|
||||
}
|
||||
|
||||
select {
|
||||
case <-time.After(retryInterval):
|
||||
continue
|
||||
case <-ctx.Done():
|
||||
return ctx.Err()
|
||||
}
|
||||
}
|
||||
|
||||
return fmt.Errorf("IPFS Cluster %s did not become ready after %d seconds", name, (maxRetries * int(retryInterval.Seconds())))
|
||||
}
|
||||
|
||||
// waitClusterFormed waits for all cluster peers to be visible from the bootstrap node
|
||||
func (pm *ProcessManager) waitClusterFormed(ctx context.Context, bootstrapRestAPIPort int) error {
|
||||
maxRetries := 30
|
||||
retryInterval := 1 * time.Second
|
||||
requiredPeers := 3 // bootstrap, node2, node3
|
||||
|
||||
for attempt := 0; attempt < maxRetries; attempt++ {
|
||||
httpURL := fmt.Sprintf("http://127.0.0.1:%d/peers", bootstrapRestAPIPort)
|
||||
resp, err := http.Get(httpURL)
|
||||
if err == nil && resp.StatusCode == 200 {
|
||||
var peers []interface{}
|
||||
if err := json.NewDecoder(resp.Body).Decode(&peers); err == nil {
|
||||
resp.Body.Close()
|
||||
if len(peers) >= requiredPeers {
|
||||
return nil // All peers have formed
|
||||
}
|
||||
} else {
|
||||
resp.Body.Close()
|
||||
}
|
||||
}
|
||||
if resp != nil {
|
||||
resp.Body.Close()
|
||||
}
|
||||
|
||||
select {
|
||||
case <-time.After(retryInterval):
|
||||
continue
|
||||
case <-ctx.Done():
|
||||
return ctx.Err()
|
||||
}
|
||||
}
|
||||
|
||||
return fmt.Errorf("IPFS Cluster did not form fully after %d seconds", (maxRetries * int(retryInterval.Seconds())))
|
||||
}
|
||||
|
||||
// cleanClusterState removes stale cluster state files to ensure fresh initialization
|
||||
// This prevents PSK (private network key) mismatches when cluster secret changes
|
||||
func (pm *ProcessManager) cleanClusterState(clusterPath string) error {
|
||||
// Remove pebble datastore (contains persisted PSK state)
|
||||
pebblePath := filepath.Join(clusterPath, "pebble")
|
||||
if err := os.RemoveAll(pebblePath); err != nil && !os.IsNotExist(err) {
|
||||
return fmt.Errorf("failed to remove pebble directory: %w", err)
|
||||
}
|
||||
|
||||
// Remove peerstore (contains peer addresses and metadata)
|
||||
peerstorePath := filepath.Join(clusterPath, "peerstore")
|
||||
if err := os.Remove(peerstorePath); err != nil && !os.IsNotExist(err) {
|
||||
return fmt.Errorf("failed to remove peerstore: %w", err)
|
||||
}
|
||||
|
||||
// Remove service.json (will be regenerated with correct ports and secret)
|
||||
serviceJSONPath := filepath.Join(clusterPath, "service.json")
|
||||
if err := os.Remove(serviceJSONPath); err != nil && !os.IsNotExist(err) {
|
||||
return fmt.Errorf("failed to remove service.json: %w", err)
|
||||
}
|
||||
|
||||
// Remove cluster.lock if it exists (from previous run)
|
||||
lockPath := filepath.Join(clusterPath, "cluster.lock")
|
||||
if err := os.Remove(lockPath); err != nil && !os.IsNotExist(err) {
|
||||
return fmt.Errorf("failed to remove cluster.lock: %w", err)
|
||||
}
|
||||
|
||||
// Note: We keep identity.json as it's tied to the node's peer ID
|
||||
// The secret will be updated via CLUSTER_SECRET env var during init
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
// ensureIPFSClusterPorts updates service.json with correct per-node ports and IPFS connector settings
|
||||
func (pm *ProcessManager) ensureIPFSClusterPorts(clusterPath string, restAPIPort int, clusterPort int) error {
|
||||
serviceJSONPath := filepath.Join(clusterPath, "service.json")
|
||||
@ -502,10 +747,13 @@ func (pm *ProcessManager) ensureIPFSClusterPorts(clusterPath string, restAPIPort
|
||||
}
|
||||
}
|
||||
|
||||
// Update cluster listen multiaddress
|
||||
// Update cluster listen multiaddress to match the correct port
|
||||
// Replace all old listen addresses with new ones for the correct port
|
||||
if cluster, ok := config["cluster"].(map[string]interface{}); ok {
|
||||
listenAddrs := make([]string, 0)
|
||||
listenAddrs = append(listenAddrs, fmt.Sprintf("/ip4/0.0.0.0/tcp/%d", clusterPort))
|
||||
listenAddrs := []string{
|
||||
fmt.Sprintf("/ip4/0.0.0.0/tcp/%d", clusterPort),
|
||||
fmt.Sprintf("/ip4/127.0.0.1/tcp/%d", clusterPort),
|
||||
}
|
||||
cluster["listen_multiaddress"] = listenAddrs
|
||||
}
|
||||
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user