diff --git a/.gitignore b/.gitignore index 7207c6b..6137656 100644 --- a/.gitignore +++ b/.gitignore @@ -105,4 +105,6 @@ website/ terms-agreement cli -./inspector \ No newline at end of file +./inspector + +results/ \ No newline at end of file diff --git a/inspector b/inspector deleted file mode 100755 index 489d340..0000000 Binary files a/inspector and /dev/null differ diff --git a/pkg/cli/inspect_command.go b/pkg/cli/inspect_command.go index e68a939..9fedf66 100644 --- a/pkg/cli/inspect_command.go +++ b/pkg/cli/inspect_command.go @@ -51,10 +51,12 @@ func HandleInspectCommand(args []string) { configPath := fs.String("config", "scripts/remote-nodes.conf", "Path to remote-nodes.conf") env := fs.String("env", "", "Environment to inspect (devnet, testnet)") - subsystem := fs.String("subsystem", "all", "Subsystem to inspect (rqlite,olric,ipfs,dns,wg,system,network,all)") + subsystem := fs.String("subsystem", "all", "Subsystem to inspect (rqlite,olric,ipfs,dns,wg,system,network,anyone,all)") format := fs.String("format", "table", "Output format (table, json)") timeout := fs.Duration("timeout", 30*time.Second, "SSH command timeout") verbose := fs.Bool("verbose", false, "Verbose output") + // Output flags + outputDir := fs.String("output", "", "Save results to directory as markdown (e.g., ./results)") // AI flags aiEnabled := fs.Bool("ai", false, "Enable AI analysis of failures") aiModel := fs.String("model", "moonshotai/kimi-k2.5", "OpenRouter model for AI analysis") @@ -70,6 +72,7 @@ func HandleInspectCommand(args []string) { fmt.Fprintf(os.Stderr, " orama inspect --env devnet --subsystem rqlite\n") fmt.Fprintf(os.Stderr, " orama inspect --env devnet --ai\n") fmt.Fprintf(os.Stderr, " orama inspect --env devnet --ai --model openai/gpt-4o\n") + fmt.Fprintf(os.Stderr, " orama inspect --env devnet --ai --output ./results\n") } if err := fs.Parse(args); err != nil { @@ -136,18 +139,31 @@ func HandleInspectCommand(args []string) { } // Phase 4: AI Analysis (if enabled and there are failures or warnings) + var analysis *inspector.AnalysisResult if *aiEnabled { issues := results.FailuresAndWarnings() if len(issues) == 0 { fmt.Printf("\nAll checks passed — no AI analysis needed.\n") + } else if *outputDir != "" { + // Per-group AI analysis for file output + groups := inspector.GroupFailures(results) + fmt.Printf("\nAnalyzing %d unique issues with %s...\n", len(groups), *aiModel) + var err error + analysis, err = inspector.AnalyzeGroups(groups, results, data, *aiModel, *aiAPIKey) + if err != nil { + fmt.Fprintf(os.Stderr, "\nAI analysis failed: %v\n", err) + } else { + inspector.PrintAnalysis(analysis, os.Stdout) + } } else { - // Count affected subsystems + // Per-subsystem AI analysis for terminal output subs := map[string]bool{} for _, c := range issues { subs[c.Subsystem] = true } fmt.Printf("\nAnalyzing %d issues across %d subsystems with %s...\n", len(issues), len(subs), *aiModel) - analysis, err := inspector.Analyze(results, data, *aiModel, *aiAPIKey) + var err error + analysis, err = inspector.Analyze(results, data, *aiModel, *aiAPIKey) if err != nil { fmt.Fprintf(os.Stderr, "\nAI analysis failed: %v\n", err) } else { @@ -156,6 +172,16 @@ func HandleInspectCommand(args []string) { } } + // Phase 5: Write results to disk (if --output is set) + if *outputDir != "" { + outPath, err := inspector.WriteResults(*outputDir, *env, results, data, analysis) + if err != nil { + fmt.Fprintf(os.Stderr, "\nError writing results: %v\n", err) + } else { + fmt.Printf("\nResults saved to %s\n", outPath) + } + } + // Exit with non-zero if any failures if failures := results.Failures(); len(failures) > 0 { os.Exit(1) diff --git a/pkg/inspector/analyzer.go b/pkg/inspector/analyzer.go index 9d86bfd..e5140cd 100644 --- a/pkg/inspector/analyzer.go +++ b/pkg/inspector/analyzer.go @@ -52,9 +52,10 @@ Step-by-step commands to resolve. Include actual node IPs/names from the data wh ### Prevention What could prevent this in the future? (omit if not applicable)` -// SubsystemAnalysis holds the AI analysis for a single subsystem. +// SubsystemAnalysis holds the AI analysis for a single subsystem or failure group. type SubsystemAnalysis struct { Subsystem string + GroupID string // e.g. "anyone.bootstrapped" — empty when analyzing whole subsystem Analysis string Duration time.Duration Error error @@ -149,6 +150,125 @@ func Analyze(results *Results, data *ClusterData, model, apiKey string) (*Analys }, nil } +// AnalyzeGroups sends each failure group to OpenRouter for focused AI analysis. +// Unlike Analyze which sends one call per subsystem, this sends one call per unique +// failure pattern, producing more focused and actionable results. +func AnalyzeGroups(groups []FailureGroup, results *Results, data *ClusterData, model, apiKey string) (*AnalysisResult, error) { + if apiKey == "" { + apiKey = os.Getenv("OPENROUTER_API_KEY") + } + if apiKey == "" { + return nil, fmt.Errorf("no API key: set --api-key or OPENROUTER_API_KEY env") + } + + if len(groups) == 0 { + return &AnalysisResult{Model: model}, nil + } + + // Build shared context + issuesBySubsystem := map[string][]CheckResult{} + for _, c := range results.FailuresAndWarnings() { + issuesBySubsystem[c.Subsystem] = append(issuesBySubsystem[c.Subsystem], c) + } + healthySummary := buildHealthySummary(results, issuesBySubsystem) + collectionErrors := buildCollectionErrors(data) + + start := time.Now() + var mu sync.Mutex + var wg sync.WaitGroup + var analyses []SubsystemAnalysis + + for _, g := range groups { + wg.Add(1) + go func(group FailureGroup) { + defer wg.Done() + + prompt := buildGroupPrompt(group, data, healthySummary, collectionErrors) + subStart := time.Now() + response, err := callOpenRouter(model, apiKey, prompt) + + sa := SubsystemAnalysis{ + Subsystem: group.Subsystem, + GroupID: group.ID, + Duration: time.Since(subStart), + } + if err != nil { + sa.Error = err + } else { + sa.Analysis = response + } + + mu.Lock() + analyses = append(analyses, sa) + mu.Unlock() + }(g) + } + wg.Wait() + + // Sort by subsystem then group ID for consistent output + sort.Slice(analyses, func(i, j int) bool { + if analyses[i].Subsystem != analyses[j].Subsystem { + return analyses[i].Subsystem < analyses[j].Subsystem + } + return analyses[i].GroupID < analyses[j].GroupID + }) + + return &AnalysisResult{ + Model: model, + Analyses: analyses, + Duration: time.Since(start), + }, nil +} + +func buildGroupPrompt(group FailureGroup, data *ClusterData, healthySummary, collectionErrors string) string { + var b strings.Builder + + icon := "FAILURE" + if group.Status == StatusWarn { + icon = "WARNING" + } + + b.WriteString(fmt.Sprintf("## %s: %s\n\n", icon, group.Name)) + b.WriteString(fmt.Sprintf("**Check ID:** %s \n", group.ID)) + b.WriteString(fmt.Sprintf("**Severity:** %s \n", group.Severity)) + b.WriteString(fmt.Sprintf("**Nodes affected:** %d \n\n", len(group.Nodes))) + + b.WriteString("**Affected nodes:**\n") + for _, n := range group.Nodes { + b.WriteString(fmt.Sprintf("- %s\n", n)) + } + b.WriteString("\n") + + b.WriteString("**Error messages:**\n") + for _, m := range group.Messages { + b.WriteString(fmt.Sprintf("- %s\n", m)) + } + b.WriteString("\n") + + // Subsystem raw data + contextData := buildSubsystemContext(group.Subsystem, data) + if contextData != "" { + b.WriteString(fmt.Sprintf("## %s Raw Data (all nodes)\n", strings.ToUpper(group.Subsystem))) + b.WriteString(contextData) + b.WriteString("\n") + } + + if healthySummary != "" { + b.WriteString("## Healthy Subsystems\n") + b.WriteString(healthySummary) + b.WriteString("\n") + } + + if collectionErrors != "" { + b.WriteString("## Collection Errors\n") + b.WriteString(collectionErrors) + b.WriteString("\n") + } + + b.WriteString(fmt.Sprintf("\nAnalyze this specific %s issue. Be concise — focus on this one problem.\n", group.Subsystem)) + return b.String() +} + func buildClusterOverview(data *ClusterData, results *Results) string { var b strings.Builder b.WriteString(fmt.Sprintf("Nodes: %d\n", len(data.Nodes))) @@ -286,6 +406,8 @@ func buildSubsystemContext(subsystem string, data *ClusterData) string { return buildNetworkContext(data) case "namespace": return buildNamespaceContext(data) + case "anyone": + return buildAnyoneContext(data) default: return "" } @@ -486,6 +608,40 @@ func buildNamespaceContext(data *ClusterData) string { return b.String() } +func buildAnyoneContext(data *ClusterData) string { + var b strings.Builder + for host, nd := range data.Nodes { + if nd.Anyone == nil { + continue + } + a := nd.Anyone + if !a.RelayActive && !a.ClientActive { + continue + } + b.WriteString(fmt.Sprintf("### %s\n", host)) + b.WriteString(fmt.Sprintf(" relay=%v client=%v orport=%v socks=%v control=%v\n", + a.RelayActive, a.ClientActive, a.ORPortListening, a.SocksListening, a.ControlListening)) + if a.RelayActive { + b.WriteString(fmt.Sprintf(" bootstrap=%d%% fingerprint=%s nickname=%s\n", + a.BootstrapPct, a.Fingerprint, a.Nickname)) + } + if len(a.ORPortReachable) > 0 { + var unreachable []string + for h, ok := range a.ORPortReachable { + if !ok { + unreachable = append(unreachable, h) + } + } + if len(unreachable) > 0 { + b.WriteString(fmt.Sprintf(" orport_unreachable: %s\n", strings.Join(unreachable, ", "))) + } else { + b.WriteString(fmt.Sprintf(" orport: all %d peers reachable\n", len(a.ORPortReachable))) + } + } + } + return b.String() +} + // OpenRouter API types (OpenAI-compatible) type openRouterRequest struct { @@ -531,7 +687,7 @@ func callOpenRouter(model, apiKey, prompt string) (string, error) { req.Header.Set("Content-Type", "application/json") req.Header.Set("Authorization", "Bearer "+apiKey) - client := &http.Client{Timeout: 120 * time.Second} + client := &http.Client{Timeout: 180 * time.Second} resp, err := client.Do(req) if err != nil { return "", fmt.Errorf("HTTP request: %w", err) diff --git a/pkg/inspector/checks/anyone.go b/pkg/inspector/checks/anyone.go new file mode 100644 index 0000000..6493513 --- /dev/null +++ b/pkg/inspector/checks/anyone.go @@ -0,0 +1,170 @@ +package checks + +import ( + "fmt" + + "github.com/DeBrosOfficial/network/pkg/inspector" +) + +func init() { + inspector.RegisterChecker("anyone", CheckAnyone) +} + +const anyoneSub = "anyone" + +// CheckAnyone runs all Anyone relay/client health checks. +func CheckAnyone(data *inspector.ClusterData) []inspector.CheckResult { + var results []inspector.CheckResult + + for _, nd := range data.Nodes { + if nd.Anyone == nil { + continue + } + results = append(results, checkAnyonePerNode(nd)...) + } + + results = append(results, checkAnyoneCrossNode(data)...) + + return results +} + +func checkAnyonePerNode(nd *inspector.NodeData) []inspector.CheckResult { + var r []inspector.CheckResult + a := nd.Anyone + node := nd.Node.Name() + + // If neither service is active, skip all checks for this node + if !a.RelayActive && !a.ClientActive { + return r + } + + // --- Relay checks --- + if a.RelayActive { + r = append(r, inspector.Pass("anyone.relay_active", "Anyone relay service active", anyoneSub, node, + "debros-anyone-relay is active", inspector.High)) + + // ORPort listening + if a.ORPortListening { + r = append(r, inspector.Pass("anyone.orport_listening", "ORPort 9001 listening", anyoneSub, node, + "port 9001 bound", inspector.High)) + } else { + r = append(r, inspector.Fail("anyone.orport_listening", "ORPort 9001 listening", anyoneSub, node, + "port 9001 NOT bound", inspector.High)) + } + + // Control port + if a.ControlListening { + r = append(r, inspector.Pass("anyone.control_listening", "Control port 9051 listening", anyoneSub, node, + "port 9051 bound", inspector.Low)) + } else { + r = append(r, inspector.Warn("anyone.control_listening", "Control port 9051 listening", anyoneSub, node, + "port 9051 NOT bound (monitoring unavailable)", inspector.Low)) + } + + // Bootstrap status + if a.Bootstrapped { + r = append(r, inspector.Pass("anyone.bootstrapped", "Relay bootstrapped", anyoneSub, node, + fmt.Sprintf("bootstrap=%d%%", a.BootstrapPct), inspector.High)) + } else if a.BootstrapPct > 0 { + r = append(r, inspector.Warn("anyone.bootstrapped", "Relay bootstrapped", anyoneSub, node, + fmt.Sprintf("bootstrap=%d%% (still connecting)", a.BootstrapPct), inspector.High)) + } else { + r = append(r, inspector.Fail("anyone.bootstrapped", "Relay bootstrapped", anyoneSub, node, + "bootstrap=0% (not started or log missing)", inspector.High)) + } + + // Fingerprint present + if a.Fingerprint != "" { + r = append(r, inspector.Pass("anyone.fingerprint", "Relay has fingerprint", anyoneSub, node, + fmt.Sprintf("fingerprint=%s", a.Fingerprint), inspector.Medium)) + } else { + r = append(r, inspector.Warn("anyone.fingerprint", "Relay has fingerprint", anyoneSub, node, + "no fingerprint found (relay may not have generated keys yet)", inspector.Medium)) + } + + // Nickname configured + if a.Nickname != "" { + r = append(r, inspector.Pass("anyone.nickname", "Relay nickname configured", anyoneSub, node, + fmt.Sprintf("nickname=%s", a.Nickname), inspector.Low)) + } else { + r = append(r, inspector.Warn("anyone.nickname", "Relay nickname configured", anyoneSub, node, + "no nickname in /etc/anon/anonrc", inspector.Low)) + } + } + + // --- Client checks --- + if a.ClientActive { + r = append(r, inspector.Pass("anyone.client_active", "Anyone client service active", anyoneSub, node, + "debros-anyone-client is active", inspector.High)) + + // SOCKS5 port listening + if a.SocksListening { + r = append(r, inspector.Pass("anyone.socks_listening", "SOCKS5 port 9050 listening", anyoneSub, node, + "port 9050 bound", inspector.High)) + } else { + r = append(r, inspector.Fail("anyone.socks_listening", "SOCKS5 port 9050 listening", anyoneSub, node, + "port 9050 NOT bound (IPFS traffic cannot route through anonymity network)", inspector.High)) + } + } + + return r +} + +func checkAnyoneCrossNode(data *inspector.ClusterData) []inspector.CheckResult { + var r []inspector.CheckResult + + // Count relay and client nodes + relayActive := 0 + relayTotal := 0 + clientActive := 0 + clientTotal := 0 + + for _, nd := range data.Nodes { + if nd.Anyone == nil { + continue + } + if nd.Anyone.RelayActive { + relayActive++ + relayTotal++ + } + if nd.Anyone.ClientActive { + clientActive++ + clientTotal++ + } + } + + // Skip cross-node checks if no Anyone services at all + if relayTotal == 0 && clientTotal == 0 { + return r + } + + // ORPort reachability: check if relays are publicly accessible from other nodes + orportChecked := 0 + orportReachable := 0 + orportFailed := 0 + + for _, nd := range data.Nodes { + if nd.Anyone == nil { + continue + } + for host, ok := range nd.Anyone.ORPortReachable { + orportChecked++ + if ok { + orportReachable++ + } else { + orportFailed++ + r = append(r, inspector.Fail("anyone.orport_reachable", + fmt.Sprintf("ORPort 9001 reachable on %s", host), + anyoneSub, nd.Node.Name(), + fmt.Sprintf("cannot TCP connect to %s:9001 from %s", host, nd.Node.Name()), inspector.High)) + } + } + } + + if orportChecked > 0 && orportFailed == 0 { + r = append(r, inspector.Pass("anyone.orport_reachable", "ORPort 9001 reachable across nodes", anyoneSub, "", + fmt.Sprintf("all %d cross-node connections OK", orportReachable), inspector.High)) + } + + return r +} diff --git a/pkg/inspector/checks/anyone_test.go b/pkg/inspector/checks/anyone_test.go new file mode 100644 index 0000000..48ca6c9 --- /dev/null +++ b/pkg/inspector/checks/anyone_test.go @@ -0,0 +1,219 @@ +package checks + +import ( + "testing" + + "github.com/DeBrosOfficial/network/pkg/inspector" +) + +func TestCheckAnyone_NilData(t *testing.T) { + nd := makeNodeData("1.1.1.1", "node") + data := makeCluster(map[string]*inspector.NodeData{"1.1.1.1": nd}) + results := CheckAnyone(data) + if len(results) != 0 { + t.Errorf("expected 0 results for nil Anyone data, got %d", len(results)) + } +} + +func TestCheckAnyone_BothInactive(t *testing.T) { + nd := makeNodeData("1.1.1.1", "node") + nd.Anyone = &inspector.AnyoneData{ + ORPortReachable: make(map[string]bool), + } + data := makeCluster(map[string]*inspector.NodeData{"1.1.1.1": nd}) + results := CheckAnyone(data) + if len(results) != 0 { + t.Errorf("expected 0 results when both services inactive, got %d", len(results)) + } +} + +func TestCheckAnyone_HealthyRelay(t *testing.T) { + nd := makeNodeData("1.1.1.1", "node") + nd.Anyone = &inspector.AnyoneData{ + RelayActive: true, + ORPortListening: true, + ControlListening: true, + Bootstrapped: true, + BootstrapPct: 100, + Fingerprint: "ABCDEF1234567890", + Nickname: "OramaRelay1", + ORPortReachable: make(map[string]bool), + } + + data := makeCluster(map[string]*inspector.NodeData{"1.1.1.1": nd}) + results := CheckAnyone(data) + + expectStatus(t, results, "anyone.relay_active", inspector.StatusPass) + expectStatus(t, results, "anyone.orport_listening", inspector.StatusPass) + expectStatus(t, results, "anyone.control_listening", inspector.StatusPass) + expectStatus(t, results, "anyone.bootstrapped", inspector.StatusPass) + expectStatus(t, results, "anyone.fingerprint", inspector.StatusPass) + expectStatus(t, results, "anyone.nickname", inspector.StatusPass) +} + +func TestCheckAnyone_HealthyClient(t *testing.T) { + nd := makeNodeData("1.1.1.1", "node") + nd.Anyone = &inspector.AnyoneData{ + ClientActive: true, + SocksListening: true, + ORPortReachable: make(map[string]bool), + } + + data := makeCluster(map[string]*inspector.NodeData{"1.1.1.1": nd}) + results := CheckAnyone(data) + + expectStatus(t, results, "anyone.client_active", inspector.StatusPass) + expectStatus(t, results, "anyone.socks_listening", inspector.StatusPass) +} + +func TestCheckAnyone_RelayORPortDown(t *testing.T) { + nd := makeNodeData("1.1.1.1", "node") + nd.Anyone = &inspector.AnyoneData{ + RelayActive: true, + ORPortListening: false, + ControlListening: true, + ORPortReachable: make(map[string]bool), + } + + data := makeCluster(map[string]*inspector.NodeData{"1.1.1.1": nd}) + results := CheckAnyone(data) + + expectStatus(t, results, "anyone.orport_listening", inspector.StatusFail) +} + +func TestCheckAnyone_RelayNotBootstrapped(t *testing.T) { + nd := makeNodeData("1.1.1.1", "node") + nd.Anyone = &inspector.AnyoneData{ + RelayActive: true, + ORPortListening: true, + BootstrapPct: 0, + Bootstrapped: false, + ORPortReachable: make(map[string]bool), + } + + data := makeCluster(map[string]*inspector.NodeData{"1.1.1.1": nd}) + results := CheckAnyone(data) + + expectStatus(t, results, "anyone.bootstrapped", inspector.StatusFail) +} + +func TestCheckAnyone_RelayPartialBootstrap(t *testing.T) { + nd := makeNodeData("1.1.1.1", "node") + nd.Anyone = &inspector.AnyoneData{ + RelayActive: true, + ORPortListening: true, + BootstrapPct: 75, + Bootstrapped: false, + ORPortReachable: make(map[string]bool), + } + + data := makeCluster(map[string]*inspector.NodeData{"1.1.1.1": nd}) + results := CheckAnyone(data) + + expectStatus(t, results, "anyone.bootstrapped", inspector.StatusWarn) +} + +func TestCheckAnyone_ClientSocksDown(t *testing.T) { + nd := makeNodeData("1.1.1.1", "node") + nd.Anyone = &inspector.AnyoneData{ + ClientActive: true, + SocksListening: false, + ORPortReachable: make(map[string]bool), + } + + data := makeCluster(map[string]*inspector.NodeData{"1.1.1.1": nd}) + results := CheckAnyone(data) + + expectStatus(t, results, "anyone.socks_listening", inspector.StatusFail) +} + +func TestCheckAnyone_NoFingerprint(t *testing.T) { + nd := makeNodeData("1.1.1.1", "node") + nd.Anyone = &inspector.AnyoneData{ + RelayActive: true, + ORPortListening: true, + Fingerprint: "", + ORPortReachable: make(map[string]bool), + } + + data := makeCluster(map[string]*inspector.NodeData{"1.1.1.1": nd}) + results := CheckAnyone(data) + + expectStatus(t, results, "anyone.fingerprint", inspector.StatusWarn) +} + +func TestCheckAnyone_CrossNode_ORPortReachable(t *testing.T) { + nd1 := makeNodeData("1.1.1.1", "node") + nd1.Anyone = &inspector.AnyoneData{ + RelayActive: true, + ORPortListening: true, + ORPortReachable: map[string]bool{"2.2.2.2": true}, + } + + nd2 := makeNodeData("2.2.2.2", "node") + nd2.Anyone = &inspector.AnyoneData{ + RelayActive: true, + ORPortListening: true, + ORPortReachable: map[string]bool{"1.1.1.1": true}, + } + + data := makeCluster(map[string]*inspector.NodeData{"1.1.1.1": nd1, "2.2.2.2": nd2}) + results := CheckAnyone(data) + + expectStatus(t, results, "anyone.orport_reachable", inspector.StatusPass) +} + +func TestCheckAnyone_CrossNode_ORPortUnreachable(t *testing.T) { + nd1 := makeNodeData("1.1.1.1", "node") + nd1.Anyone = &inspector.AnyoneData{ + RelayActive: true, + ORPortListening: true, + ORPortReachable: map[string]bool{"2.2.2.2": false}, + } + + nd2 := makeNodeData("2.2.2.2", "node") + nd2.Anyone = &inspector.AnyoneData{ + RelayActive: true, + ORPortListening: true, + ORPortReachable: map[string]bool{"1.1.1.1": true}, + } + + data := makeCluster(map[string]*inspector.NodeData{"1.1.1.1": nd1, "2.2.2.2": nd2}) + results := CheckAnyone(data) + + // Should have at least one fail for the unreachable connection + hasFail := false + for _, r := range results { + if r.ID == "anyone.orport_reachable" && r.Status == inspector.StatusFail { + hasFail = true + } + } + if !hasFail { + t.Error("expected at least one anyone.orport_reachable fail") + } +} + +func TestCheckAnyone_BothRelayAndClient(t *testing.T) { + nd := makeNodeData("1.1.1.1", "node") + nd.Anyone = &inspector.AnyoneData{ + RelayActive: true, + ClientActive: true, + ORPortListening: true, + SocksListening: true, + ControlListening: true, + Bootstrapped: true, + BootstrapPct: 100, + Fingerprint: "ABCDEF", + Nickname: "test", + ORPortReachable: make(map[string]bool), + } + + data := makeCluster(map[string]*inspector.NodeData{"1.1.1.1": nd}) + results := CheckAnyone(data) + + // Should have both relay and client checks + expectStatus(t, results, "anyone.relay_active", inspector.StatusPass) + expectStatus(t, results, "anyone.client_active", inspector.StatusPass) + expectStatus(t, results, "anyone.socks_listening", inspector.StatusPass) + expectStatus(t, results, "anyone.orport_listening", inspector.StatusPass) +} diff --git a/pkg/inspector/checks/system.go b/pkg/inspector/checks/system.go index ad125a3..ce53e2a 100644 --- a/pkg/inspector/checks/system.go +++ b/pkg/inspector/checks/system.go @@ -50,6 +50,22 @@ func checkSystemPerNode(nd *inspector.NodeData) []inspector.CheckResult { } } + // 6.2 Anyone relay/client services (only check if installed, don't fail if absent) + for _, svc := range []string{"debros-anyone-relay", "debros-anyone-client"} { + status, ok := sys.Services[svc] + if !ok || status == "inactive" { + continue // not installed or intentionally stopped + } + id := fmt.Sprintf("system.svc_%s", strings.ReplaceAll(svc, "-", "_")) + name := fmt.Sprintf("%s service active", svc) + if status == "active" { + r = append(r, inspector.Pass(id, name, systemSub, node, "active", inspector.High)) + } else { + r = append(r, inspector.Fail(id, name, systemSub, node, + fmt.Sprintf("status=%s (should be active or uninstalled)", status), inspector.High)) + } + } + // 6.5 WireGuard service if status, ok := sys.Services["wg-quick@wg0"]; ok { if status == "active" { diff --git a/pkg/inspector/collector.go b/pkg/inspector/collector.go index 520d981..7d1e61b 100644 --- a/pkg/inspector/collector.go +++ b/pkg/inspector/collector.go @@ -26,6 +26,7 @@ type NodeData struct { WireGuard *WireGuardData System *SystemData Network *NetworkData + Anyone *AnyoneData Namespaces []NamespaceData // namespace instances on this node Errors []string // collection errors for this node } @@ -224,6 +225,21 @@ type NetworkData struct { PingResults map[string]bool // WG peer IP → ping success } +// AnyoneData holds parsed Anyone relay/client status from a node. +type AnyoneData struct { + RelayActive bool // debros-anyone-relay systemd service active + ClientActive bool // debros-anyone-client systemd service active + ORPortListening bool // port 9001 bound locally + SocksListening bool // port 9050 bound locally (client SOCKS5) + ControlListening bool // port 9051 bound locally (control port) + Bootstrapped bool // relay has bootstrapped to 100% + BootstrapPct int // bootstrap percentage (0-100) + Fingerprint string // relay fingerprint + Nickname string // relay nickname + UptimeStr string // uptime from control port + ORPortReachable map[string]bool // host IP → whether we can TCP connect to their 9001 from this node +} + // Collect gathers data from all nodes in parallel. func Collect(ctx context.Context, nodes []Node, subsystems []string, verbose bool) *ClusterData { start := time.Now() @@ -246,6 +262,10 @@ func Collect(ctx context.Context, nodes []Node, subsystems []string, verbose boo } wg.Wait() + + // Second pass: cross-node ORPort reachability (needs all nodes collected first) + collectAnyoneReachability(ctx, data) + data.Duration = time.Since(start) return data } @@ -286,6 +306,9 @@ func collectNode(ctx context.Context, node Node, subsystems []string, verbose bo if shouldCollect("network") { nd.Network = collectNetwork(ctx, node, nd.WireGuard) } + if shouldCollect("anyone") { + nd.Anyone = collectAnyone(ctx, node) + } // Namespace collection — always collect if any subsystem is collected nd.Namespaces = collectNamespaces(ctx, node) @@ -1113,6 +1136,139 @@ echo "$SEP" return data } +func collectAnyone(ctx context.Context, node Node) *AnyoneData { + data := &AnyoneData{ + ORPortReachable: make(map[string]bool), + } + + cmd := ` +SEP="===INSPECTOR_SEP===" +echo "$SEP" +systemctl is-active debros-anyone-relay 2>/dev/null || echo inactive +echo "$SEP" +systemctl is-active debros-anyone-client 2>/dev/null || echo inactive +echo "$SEP" +ss -tlnp 2>/dev/null | grep -q ':9001 ' && echo yes || echo no +echo "$SEP" +ss -tlnp 2>/dev/null | grep -q ':9050 ' && echo yes || echo no +echo "$SEP" +ss -tlnp 2>/dev/null | grep -q ':9051 ' && echo yes || echo no +echo "$SEP" +# Check bootstrap status from log (last 50 lines) +grep -oP 'Bootstrapped \K[0-9]+' /var/log/anon/notices.log 2>/dev/null | tail -1 || echo 0 +echo "$SEP" +# Read fingerprint +cat /var/lib/anon/fingerprint 2>/dev/null || echo "" +echo "$SEP" +# Read nickname from config +grep -oP '^Nickname \K\S+' /etc/anon/anonrc 2>/dev/null || echo "" +` + + res := RunSSH(ctx, node, cmd) + if !res.OK() && res.Stdout == "" { + return data + } + + parts := strings.Split(res.Stdout, "===INSPECTOR_SEP===") + + if len(parts) > 1 { + data.RelayActive = strings.TrimSpace(parts[1]) == "active" + } + if len(parts) > 2 { + data.ClientActive = strings.TrimSpace(parts[2]) == "active" + } + if len(parts) > 3 { + data.ORPortListening = strings.TrimSpace(parts[3]) == "yes" + } + if len(parts) > 4 { + data.SocksListening = strings.TrimSpace(parts[4]) == "yes" + } + if len(parts) > 5 { + data.ControlListening = strings.TrimSpace(parts[5]) == "yes" + } + if len(parts) > 6 { + pct := parseIntDefault(strings.TrimSpace(parts[6]), 0) + data.BootstrapPct = pct + data.Bootstrapped = pct >= 100 + } + if len(parts) > 7 { + data.Fingerprint = strings.TrimSpace(parts[7]) + } + if len(parts) > 8 { + data.Nickname = strings.TrimSpace(parts[8]) + } + + // If neither relay nor client is active, skip further checks + if !data.RelayActive && !data.ClientActive { + return data + } + + return data +} + +// collectAnyoneReachability runs a second pass to check ORPort reachability across nodes. +// Called after all nodes are collected so we know which nodes run relays. +func collectAnyoneReachability(ctx context.Context, data *ClusterData) { + // Find all nodes running the relay (have ORPort listening) + var relayHosts []string + for host, nd := range data.Nodes { + if nd.Anyone != nil && nd.Anyone.RelayActive && nd.Anyone.ORPortListening { + relayHosts = append(relayHosts, host) + } + } + + if len(relayHosts) == 0 { + return + } + + // From each node, try to TCP connect to each relay's ORPort 9001 + var mu sync.Mutex + var wg sync.WaitGroup + + for _, nd := range data.Nodes { + if nd.Anyone == nil { + continue + } + wg.Add(1) + go func(nd *NodeData) { + defer wg.Done() + + // Build commands to test TCP connectivity to each relay + var tcpCmds string + for _, relayHost := range relayHosts { + if relayHost == nd.Node.Host { + continue // skip self + } + tcpCmds += fmt.Sprintf( + `echo "ORPORT:%s:$(timeout 3 bash -c 'echo >/dev/tcp/%s/9001' 2>/dev/null && echo ok || echo fail)" +`, relayHost, relayHost) + } + + if tcpCmds == "" { + return + } + + res := RunSSH(ctx, nd.Node, tcpCmds) + if res.Stdout == "" { + return + } + + mu.Lock() + defer mu.Unlock() + for _, line := range strings.Split(res.Stdout, "\n") { + line = strings.TrimSpace(line) + if strings.HasPrefix(line, "ORPORT:") { + p := strings.SplitN(line, ":", 3) + if len(p) == 3 { + nd.Anyone.ORPortReachable[p[1]] = p[2] == "ok" + } + } + } + }(nd) + } + wg.Wait() +} + func collectNamespaces(ctx context.Context, node Node) []NamespaceData { // Detect namespace services: debros-namespace-gateway@.service cmd := ` diff --git a/pkg/inspector/results_writer.go b/pkg/inspector/results_writer.go new file mode 100644 index 0000000..cf71ed6 --- /dev/null +++ b/pkg/inspector/results_writer.go @@ -0,0 +1,354 @@ +package inspector + +import ( + "fmt" + "os" + "path/filepath" + "sort" + "strings" + "time" +) + +// FailureGroup groups identical check failures/warnings across nodes. +type FailureGroup struct { + ID string + Name string // from first check in group + Status Status + Severity Severity + Subsystem string + Nodes []string // affected node names (deduplicated) + Messages []string // unique messages (capped at 5) + Count int // total raw occurrence count (before dedup) +} + +// GroupFailures collapses CheckResults into unique failure groups keyed by (ID, Status). +// Only failures and warnings are grouped; passes and skips are ignored. +func GroupFailures(results *Results) []FailureGroup { + type groupKey struct { + ID string + Status Status + } + + seen := map[groupKey]*FailureGroup{} + nodesSeen := map[groupKey]map[string]bool{} + var order []groupKey + + for _, c := range results.Checks { + if c.Status != StatusFail && c.Status != StatusWarn { + continue + } + k := groupKey{ID: c.ID, Status: c.Status} + g, exists := seen[k] + if !exists { + g = &FailureGroup{ + ID: c.ID, + Name: c.Name, + Status: c.Status, + Severity: c.Severity, + Subsystem: c.Subsystem, + } + seen[k] = g + nodesSeen[k] = map[string]bool{} + order = append(order, k) + } + g.Count++ + node := c.Node + if node == "" { + node = "cluster-wide" + } + // Deduplicate nodes (a node may appear for multiple targets) + if !nodesSeen[k][node] { + nodesSeen[k][node] = true + g.Nodes = append(g.Nodes, node) + } + // Track unique messages (cap at 5 to avoid bloat) + if len(g.Messages) < 5 { + found := false + for _, m := range g.Messages { + if m == c.Message { + found = true + break + } + } + if !found { + g.Messages = append(g.Messages, c.Message) + } + } + } + + // Sort: failures before warnings, then by severity (high first), then by ID + groups := make([]FailureGroup, 0, len(order)) + for _, k := range order { + groups = append(groups, *seen[k]) + } + sort.Slice(groups, func(i, j int) bool { + oi, oj := statusOrder(groups[i].Status), statusOrder(groups[j].Status) + if oi != oj { + return oi < oj + } + if groups[i].Severity != groups[j].Severity { + return groups[i].Severity > groups[j].Severity + } + return groups[i].ID < groups[j].ID + }) + + return groups +} + +// WriteResults saves inspection results as markdown files to a timestamped directory. +// Returns the output directory path. +func WriteResults(baseDir, env string, results *Results, data *ClusterData, analysis *AnalysisResult) (string, error) { + ts := time.Now().Format("2006-01-02_150405") + dir := filepath.Join(baseDir, env, ts) + + if err := os.MkdirAll(dir, 0o755); err != nil { + return "", fmt.Errorf("create output directory: %w", err) + } + + groups := GroupFailures(results) + + // Build analysis lookup: groupID -> analysis text + analysisMap := map[string]string{} + if analysis != nil { + for _, sa := range analysis.Analyses { + key := sa.GroupID + if key == "" { + key = sa.Subsystem + } + if sa.Error == nil { + analysisMap[key] = sa.Analysis + } + } + } + + // Write summary.md + if err := writeSummary(dir, env, ts, results, data, groups, analysisMap); err != nil { + return "", fmt.Errorf("write summary: %w", err) + } + + // Group checks by subsystem for per-subsystem files + checksBySubsystem := map[string][]CheckResult{} + for _, c := range results.Checks { + checksBySubsystem[c.Subsystem] = append(checksBySubsystem[c.Subsystem], c) + } + + groupsBySubsystem := map[string][]FailureGroup{} + for _, g := range groups { + groupsBySubsystem[g.Subsystem] = append(groupsBySubsystem[g.Subsystem], g) + } + + // Write per-subsystem files + for sub, checks := range checksBySubsystem { + subGroups := groupsBySubsystem[sub] + if err := writeSubsystem(dir, sub, ts, checks, subGroups, analysisMap); err != nil { + return "", fmt.Errorf("write %s: %w", sub, err) + } + } + + return dir, nil +} + +func writeSummary(dir, env, ts string, results *Results, data *ClusterData, groups []FailureGroup, analysisMap map[string]string) error { + var b strings.Builder + passed, failed, warned, skipped := results.Summary() + + b.WriteString(fmt.Sprintf("# %s Inspection Report\n\n", strings.ToUpper(env))) + b.WriteString(fmt.Sprintf("**Date:** %s \n", ts)) + b.WriteString(fmt.Sprintf("**Nodes:** %d \n", len(data.Nodes))) + b.WriteString(fmt.Sprintf("**Total:** %d passed, %d failed, %d warnings, %d skipped \n\n", passed, failed, warned, skipped)) + + // Per-subsystem table + subStats := map[string][4]int{} // [pass, fail, warn, skip] + var subsystems []string + for _, c := range results.Checks { + if _, exists := subStats[c.Subsystem]; !exists { + subsystems = append(subsystems, c.Subsystem) + } + s := subStats[c.Subsystem] + switch c.Status { + case StatusPass: + s[0]++ + case StatusFail: + s[1]++ + case StatusWarn: + s[2]++ + case StatusSkip: + s[3]++ + } + subStats[c.Subsystem] = s + } + sort.Strings(subsystems) + + // Count issue groups per subsystem + issueCountBySub := map[string]int{} + for _, g := range groups { + issueCountBySub[g.Subsystem]++ + } + + b.WriteString("## Subsystems\n\n") + b.WriteString("| Subsystem | Pass | Fail | Warn | Skip | Issues |\n") + b.WriteString("|-----------|------|------|------|------|--------|\n") + for _, sub := range subsystems { + s := subStats[sub] + issues := issueCountBySub[sub] + link := fmt.Sprintf("[%s](%s.md)", sub, sub) + b.WriteString(fmt.Sprintf("| %s | %d | %d | %d | %d | %d |\n", link, s[0], s[1], s[2], s[3], issues)) + } + b.WriteString("\n") + + // Critical issues section + critical := filterGroupsBySeverity(groups, High) + if len(critical) > 0 { + b.WriteString("## Critical Issues\n\n") + for i, g := range critical { + icon := "FAIL" + if g.Status == StatusWarn { + icon = "WARN" + } + nodeInfo := fmt.Sprintf("%d nodes", len(g.Nodes)) + if g.Count > len(g.Nodes) { + nodeInfo = fmt.Sprintf("%d nodes (%d occurrences)", len(g.Nodes), g.Count) + } + b.WriteString(fmt.Sprintf("%d. **[%s]** %s — %s \n", i+1, icon, g.Name, nodeInfo)) + b.WriteString(fmt.Sprintf(" *%s* → [details](%s.md#%s) \n", + g.Messages[0], g.Subsystem, anchorID(g.Name))) + } + b.WriteString("\n") + } + + // Collection errors + var errs []string + for _, nd := range data.Nodes { + for _, e := range nd.Errors { + errs = append(errs, fmt.Sprintf("- **%s**: %s", nd.Node.Name(), e)) + } + } + if len(errs) > 0 { + b.WriteString("## Collection Errors\n\n") + for _, e := range errs { + b.WriteString(e + "\n") + } + b.WriteString("\n") + } + + return os.WriteFile(filepath.Join(dir, "summary.md"), []byte(b.String()), 0o644) +} + +func writeSubsystem(dir, subsystem, ts string, checks []CheckResult, groups []FailureGroup, analysisMap map[string]string) error { + var b strings.Builder + + // Count + var passed, failed, warned, skipped int + for _, c := range checks { + switch c.Status { + case StatusPass: + passed++ + case StatusFail: + failed++ + case StatusWarn: + warned++ + case StatusSkip: + skipped++ + } + } + + b.WriteString(fmt.Sprintf("# %s\n\n", strings.ToUpper(subsystem))) + b.WriteString(fmt.Sprintf("**Date:** %s \n", ts)) + b.WriteString(fmt.Sprintf("**Checks:** %d passed, %d failed, %d warnings, %d skipped \n\n", passed, failed, warned, skipped)) + + // Issues section + if len(groups) > 0 { + b.WriteString("## Issues\n\n") + for i, g := range groups { + icon := "FAIL" + if g.Status == StatusWarn { + icon = "WARN" + } + b.WriteString(fmt.Sprintf("### %d. %s\n\n", i+1, g.Name)) + nodeInfo := fmt.Sprintf("%d nodes", len(g.Nodes)) + if g.Count > len(g.Nodes) { + nodeInfo = fmt.Sprintf("%d nodes (%d occurrences)", len(g.Nodes), g.Count) + } + b.WriteString(fmt.Sprintf("**Status:** %s | **Severity:** %s | **Affected:** %s \n\n", icon, g.Severity, nodeInfo)) + + // Affected nodes + b.WriteString("**Affected nodes:**\n") + for _, n := range g.Nodes { + b.WriteString(fmt.Sprintf("- `%s`\n", n)) + } + b.WriteString("\n") + + // Messages + if len(g.Messages) == 1 { + b.WriteString(fmt.Sprintf("**Detail:** %s\n\n", g.Messages[0])) + } else { + b.WriteString("**Details:**\n") + for _, m := range g.Messages { + b.WriteString(fmt.Sprintf("- %s\n", m)) + } + b.WriteString("\n") + } + + // AI analysis (if available) + if ai, ok := analysisMap[g.ID]; ok { + b.WriteString(ai) + b.WriteString("\n\n") + } + + b.WriteString("---\n\n") + } + } + + // All checks table + b.WriteString("## All Checks\n\n") + b.WriteString("| Status | Severity | Check | Node | Detail |\n") + b.WriteString("|--------|----------|-------|------|--------|\n") + + // Sort: failures first + sorted := make([]CheckResult, len(checks)) + copy(sorted, checks) + sort.Slice(sorted, func(i, j int) bool { + oi, oj := statusOrder(sorted[i].Status), statusOrder(sorted[j].Status) + if oi != oj { + return oi < oj + } + if sorted[i].Severity != sorted[j].Severity { + return sorted[i].Severity > sorted[j].Severity + } + return sorted[i].ID < sorted[j].ID + }) + + for _, c := range sorted { + node := c.Node + if node == "" { + node = "cluster-wide" + } + msg := strings.ReplaceAll(c.Message, "|", "\\|") + b.WriteString(fmt.Sprintf("| %s | %s | %s | %s | %s |\n", + statusIcon(c.Status), c.Severity, c.Name, node, msg)) + } + + return os.WriteFile(filepath.Join(dir, subsystem+".md"), []byte(b.String()), 0o644) +} + +func filterGroupsBySeverity(groups []FailureGroup, minSeverity Severity) []FailureGroup { + var out []FailureGroup + for _, g := range groups { + if g.Severity >= minSeverity { + out = append(out, g) + } + } + return out +} + +func anchorID(name string) string { + s := strings.ToLower(name) + s = strings.ReplaceAll(s, " ", "-") + s = strings.Map(func(r rune) rune { + if (r >= 'a' && r <= 'z') || (r >= '0' && r <= '9') || r == '-' { + return r + } + return -1 + }, s) + return s +}