mirror of
https://github.com/DeBrosOfficial/orama.git
synced 2026-03-17 08:36:57 +00:00
Added system prompt on inspector
This commit is contained in:
parent
051c002ec8
commit
35ad8bdb16
@ -141,7 +141,12 @@ func HandleInspectCommand(args []string) {
|
|||||||
if len(issues) == 0 {
|
if len(issues) == 0 {
|
||||||
fmt.Printf("\nAll checks passed — no AI analysis needed.\n")
|
fmt.Printf("\nAll checks passed — no AI analysis needed.\n")
|
||||||
} else {
|
} else {
|
||||||
fmt.Printf("\nAnalyzing %d issues with %s...\n", len(issues), *aiModel)
|
// Count affected subsystems
|
||||||
|
subs := map[string]bool{}
|
||||||
|
for _, c := range issues {
|
||||||
|
subs[c.Subsystem] = true
|
||||||
|
}
|
||||||
|
fmt.Printf("\nAnalyzing %d issues across %d subsystems with %s...\n", len(issues), len(subs), *aiModel)
|
||||||
analysis, err := inspector.Analyze(results, data, *aiModel, *aiAPIKey)
|
analysis, err := inspector.Analyze(results, data, *aiModel, *aiAPIKey)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
fmt.Fprintf(os.Stderr, "\nAI analysis failed: %v\n", err)
|
fmt.Fprintf(os.Stderr, "\nAI analysis failed: %v\n", err)
|
||||||
|
|||||||
@ -7,18 +7,68 @@ import (
|
|||||||
"io"
|
"io"
|
||||||
"net/http"
|
"net/http"
|
||||||
"os"
|
"os"
|
||||||
|
"sort"
|
||||||
"strings"
|
"strings"
|
||||||
|
"sync"
|
||||||
"time"
|
"time"
|
||||||
)
|
)
|
||||||
|
|
||||||
|
// System prompt with architecture context and remediation knowledge.
|
||||||
|
const systemPrompt = `You are a distributed systems expert analyzing health check results for an Orama Network cluster.
|
||||||
|
|
||||||
|
## Architecture
|
||||||
|
- **RQLite**: Raft consensus SQLite database. Requires N/2+1 quorum for writes. Each node runs one instance.
|
||||||
|
- **Olric**: Distributed in-memory cache using memberlist protocol. Coordinates via elected coordinator node.
|
||||||
|
- **IPFS**: Decentralized storage with private swarm (swarm key). Runs Kubo daemon + IPFS Cluster for pinning.
|
||||||
|
- **CoreDNS + Caddy**: DNS resolution (port 53) and TLS termination (ports 80/443). Only on nameserver nodes.
|
||||||
|
- **WireGuard**: Mesh VPN connecting all nodes via 10.0.0.0/8 on port 51820. All inter-node traffic goes over WG.
|
||||||
|
- **Namespaces**: Isolated tenant environments. Each namespace runs its own RQLite + Olric + Gateway on a 5-port block (base+0=RQLite HTTP, +1=Raft, +2=Olric HTTP, +3=Memberlist, +4=Gateway).
|
||||||
|
|
||||||
|
## Common Failure Patterns
|
||||||
|
- If WireGuard is down on a node, ALL services on that node will appear unreachable from other nodes.
|
||||||
|
- RQLite losing quorum (< N/2+1 voters) means the cluster cannot accept writes. Reads may still work.
|
||||||
|
- Olric suspects/flapping in logs usually means unstable network between nodes (check WireGuard first).
|
||||||
|
- IPFS swarm peers dropping to 0 means the node is isolated from the private swarm.
|
||||||
|
- High TCP retransmission (>2%) indicates packet loss, often due to WireGuard MTU issues.
|
||||||
|
|
||||||
|
## Service Management
|
||||||
|
- ALWAYS use the CLI for service operations: ` + "`sudo orama prod restart`" + `, ` + "`sudo orama prod stop`" + `, ` + "`sudo orama prod start`" + `
|
||||||
|
- NEVER use raw systemctl commands (they skip important lifecycle hooks).
|
||||||
|
- For rolling restarts: upgrade followers first, leader LAST, one node at a time.
|
||||||
|
- Check RQLite leader: ` + "`curl -s localhost:4001/status | python3 -c \"import sys,json; print(json.load(sys.stdin)['store']['raft']['state'])\"`" + `
|
||||||
|
|
||||||
|
## Response Format
|
||||||
|
Respond in this exact structure:
|
||||||
|
|
||||||
|
### Root Cause
|
||||||
|
What is causing these failures? If multiple issues, explain each briefly.
|
||||||
|
|
||||||
|
### Impact
|
||||||
|
What is broken for users right now? Can they still deploy apps, access services?
|
||||||
|
|
||||||
|
### Fix
|
||||||
|
Step-by-step commands to resolve. Include actual node IPs/names from the data when possible.
|
||||||
|
|
||||||
|
### Prevention
|
||||||
|
What could prevent this in the future? (omit if not applicable)`
|
||||||
|
|
||||||
|
// SubsystemAnalysis holds the AI analysis for a single subsystem.
|
||||||
|
type SubsystemAnalysis struct {
|
||||||
|
Subsystem string
|
||||||
|
Analysis string
|
||||||
|
Duration time.Duration
|
||||||
|
Error error
|
||||||
|
}
|
||||||
|
|
||||||
// AnalysisResult holds the AI's analysis of check failures.
|
// AnalysisResult holds the AI's analysis of check failures.
|
||||||
type AnalysisResult struct {
|
type AnalysisResult struct {
|
||||||
Model string
|
Model string
|
||||||
Analysis string
|
Analyses []SubsystemAnalysis
|
||||||
Duration time.Duration
|
Duration time.Duration
|
||||||
}
|
}
|
||||||
|
|
||||||
// Analyze sends failures and cluster context to OpenRouter for AI analysis.
|
// Analyze sends failures and cluster context to OpenRouter for AI analysis.
|
||||||
|
// Each subsystem with issues gets its own API call, run in parallel.
|
||||||
func Analyze(results *Results, data *ClusterData, model, apiKey string) (*AnalysisResult, error) {
|
func Analyze(results *Results, data *ClusterData, model, apiKey string) (*AnalysisResult, error) {
|
||||||
if apiKey == "" {
|
if apiKey == "" {
|
||||||
apiKey = os.Getenv("OPENROUTER_API_KEY")
|
apiKey = os.Getenv("OPENROUTER_API_KEY")
|
||||||
@ -27,45 +77,148 @@ func Analyze(results *Results, data *ClusterData, model, apiKey string) (*Analys
|
|||||||
return nil, fmt.Errorf("no API key: set --api-key or OPENROUTER_API_KEY env")
|
return nil, fmt.Errorf("no API key: set --api-key or OPENROUTER_API_KEY env")
|
||||||
}
|
}
|
||||||
|
|
||||||
// Build the prompt with failures, warnings, and cluster context
|
// Group failures and warnings by subsystem
|
||||||
prompt := buildAnalysisPrompt(results, data)
|
issues := results.FailuresAndWarnings()
|
||||||
|
bySubsystem := map[string][]CheckResult{}
|
||||||
start := time.Now()
|
for _, c := range issues {
|
||||||
response, err := callOpenRouter(model, apiKey, prompt)
|
bySubsystem[c.Subsystem] = append(bySubsystem[c.Subsystem], c)
|
||||||
if err != nil {
|
|
||||||
return nil, fmt.Errorf("OpenRouter API call failed: %w", err)
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if len(bySubsystem) == 0 {
|
||||||
|
return &AnalysisResult{Model: model}, nil
|
||||||
|
}
|
||||||
|
|
||||||
|
// Build healthy summary (subsystems with zero failures/warnings)
|
||||||
|
healthySummary := buildHealthySummary(results, bySubsystem)
|
||||||
|
|
||||||
|
// Build collection errors summary
|
||||||
|
collectionErrors := buildCollectionErrors(data)
|
||||||
|
|
||||||
|
// Build cluster overview (shared across all calls)
|
||||||
|
clusterOverview := buildClusterOverview(data, results)
|
||||||
|
|
||||||
|
// Launch one AI call per subsystem in parallel
|
||||||
|
start := time.Now()
|
||||||
|
var mu sync.Mutex
|
||||||
|
var wg sync.WaitGroup
|
||||||
|
var analyses []SubsystemAnalysis
|
||||||
|
|
||||||
|
// Sort subsystems for deterministic ordering
|
||||||
|
subsystems := make([]string, 0, len(bySubsystem))
|
||||||
|
for sub := range bySubsystem {
|
||||||
|
subsystems = append(subsystems, sub)
|
||||||
|
}
|
||||||
|
sort.Strings(subsystems)
|
||||||
|
|
||||||
|
for _, sub := range subsystems {
|
||||||
|
checks := bySubsystem[sub]
|
||||||
|
wg.Add(1)
|
||||||
|
go func(subsystem string, checks []CheckResult) {
|
||||||
|
defer wg.Done()
|
||||||
|
|
||||||
|
prompt := buildSubsystemPrompt(subsystem, checks, data, clusterOverview, healthySummary, collectionErrors)
|
||||||
|
subStart := time.Now()
|
||||||
|
response, err := callOpenRouter(model, apiKey, prompt)
|
||||||
|
|
||||||
|
sa := SubsystemAnalysis{
|
||||||
|
Subsystem: subsystem,
|
||||||
|
Duration: time.Since(subStart),
|
||||||
|
}
|
||||||
|
if err != nil {
|
||||||
|
sa.Error = err
|
||||||
|
} else {
|
||||||
|
sa.Analysis = response
|
||||||
|
}
|
||||||
|
|
||||||
|
mu.Lock()
|
||||||
|
analyses = append(analyses, sa)
|
||||||
|
mu.Unlock()
|
||||||
|
}(sub, checks)
|
||||||
|
}
|
||||||
|
wg.Wait()
|
||||||
|
|
||||||
|
// Sort by subsystem name for consistent output
|
||||||
|
sort.Slice(analyses, func(i, j int) bool {
|
||||||
|
return analyses[i].Subsystem < analyses[j].Subsystem
|
||||||
|
})
|
||||||
|
|
||||||
return &AnalysisResult{
|
return &AnalysisResult{
|
||||||
Model: model,
|
Model: model,
|
||||||
Analysis: response,
|
Analyses: analyses,
|
||||||
Duration: time.Since(start),
|
Duration: time.Since(start),
|
||||||
}, nil
|
}, nil
|
||||||
}
|
}
|
||||||
|
|
||||||
func buildAnalysisPrompt(results *Results, data *ClusterData) string {
|
func buildClusterOverview(data *ClusterData, results *Results) string {
|
||||||
var b strings.Builder
|
var b strings.Builder
|
||||||
|
b.WriteString(fmt.Sprintf("Nodes: %d\n", len(data.Nodes)))
|
||||||
// System context
|
|
||||||
b.WriteString("You are a distributed systems expert analyzing health check results for an Orama Network cluster.\n")
|
|
||||||
b.WriteString("The cluster runs RQLite (Raft consensus), Olric (distributed cache), IPFS, CoreDNS, and WireGuard.\n\n")
|
|
||||||
|
|
||||||
// Cluster overview
|
|
||||||
b.WriteString("## Cluster Overview\n")
|
|
||||||
b.WriteString(fmt.Sprintf("Nodes inspected: %d\n", len(data.Nodes)))
|
|
||||||
for host, nd := range data.Nodes {
|
for host, nd := range data.Nodes {
|
||||||
b.WriteString(fmt.Sprintf("- %s (role: %s)\n", host, nd.Node.Role))
|
b.WriteString(fmt.Sprintf("- %s (role: %s)\n", host, nd.Node.Role))
|
||||||
}
|
}
|
||||||
|
passed, failed, warned, skipped := results.Summary()
|
||||||
|
b.WriteString(fmt.Sprintf("\nCheck totals: %d passed, %d failed, %d warnings, %d skipped\n", passed, failed, warned, skipped))
|
||||||
|
return b.String()
|
||||||
|
}
|
||||||
|
|
||||||
|
func buildHealthySummary(results *Results, issueSubsystems map[string][]CheckResult) string {
|
||||||
|
// Count passes per subsystem
|
||||||
|
passBySubsystem := map[string]int{}
|
||||||
|
totalBySubsystem := map[string]int{}
|
||||||
|
for _, c := range results.Checks {
|
||||||
|
totalBySubsystem[c.Subsystem]++
|
||||||
|
if c.Status == StatusPass {
|
||||||
|
passBySubsystem[c.Subsystem]++
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
var b strings.Builder
|
||||||
|
for sub, total := range totalBySubsystem {
|
||||||
|
if _, hasIssues := issueSubsystems[sub]; hasIssues {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
passed := passBySubsystem[sub]
|
||||||
|
if passed == total && total > 0 {
|
||||||
|
b.WriteString(fmt.Sprintf("- %s: all %d checks pass\n", sub, total))
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if b.Len() == 0 {
|
||||||
|
return ""
|
||||||
|
}
|
||||||
|
return b.String()
|
||||||
|
}
|
||||||
|
|
||||||
|
func buildCollectionErrors(data *ClusterData) string {
|
||||||
|
var b strings.Builder
|
||||||
|
for _, nd := range data.Nodes {
|
||||||
|
if len(nd.Errors) > 0 {
|
||||||
|
for _, e := range nd.Errors {
|
||||||
|
b.WriteString(fmt.Sprintf("- %s: %s\n", nd.Node.Name(), e))
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return b.String()
|
||||||
|
}
|
||||||
|
|
||||||
|
func buildSubsystemPrompt(subsystem string, checks []CheckResult, data *ClusterData, clusterOverview, healthySummary, collectionErrors string) string {
|
||||||
|
var b strings.Builder
|
||||||
|
|
||||||
|
b.WriteString("## Cluster Overview\n")
|
||||||
|
b.WriteString(clusterOverview)
|
||||||
b.WriteString("\n")
|
b.WriteString("\n")
|
||||||
|
|
||||||
// Summary
|
// Failures
|
||||||
passed, failed, warned, skipped := results.Summary()
|
var failures, warnings []CheckResult
|
||||||
b.WriteString(fmt.Sprintf("## Check Results: %d passed, %d failed, %d warnings, %d skipped\n\n", passed, failed, warned, skipped))
|
for _, c := range checks {
|
||||||
|
if c.Status == StatusFail {
|
||||||
|
failures = append(failures, c)
|
||||||
|
} else if c.Status == StatusWarn {
|
||||||
|
warnings = append(warnings, c)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
// List all failures
|
|
||||||
failures := results.Failures()
|
|
||||||
if len(failures) > 0 {
|
if len(failures) > 0 {
|
||||||
b.WriteString("## Failures (CRITICAL)\n")
|
b.WriteString(fmt.Sprintf("## %s Failures\n", strings.ToUpper(subsystem)))
|
||||||
for _, f := range failures {
|
for _, f := range failures {
|
||||||
node := f.Node
|
node := f.Node
|
||||||
if node == "" {
|
if node == "" {
|
||||||
@ -76,17 +229,9 @@ func buildAnalysisPrompt(results *Results, data *ClusterData) string {
|
|||||||
b.WriteString("\n")
|
b.WriteString("\n")
|
||||||
}
|
}
|
||||||
|
|
||||||
// List all warnings
|
if len(warnings) > 0 {
|
||||||
warnings := results.FailuresAndWarnings()
|
b.WriteString(fmt.Sprintf("## %s Warnings\n", strings.ToUpper(subsystem)))
|
||||||
warningsOnly := make([]CheckResult, 0)
|
|
||||||
for _, w := range warnings {
|
for _, w := range warnings {
|
||||||
if w.Status == StatusWarn {
|
|
||||||
warningsOnly = append(warningsOnly, w)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
if len(warningsOnly) > 0 {
|
|
||||||
b.WriteString("## Warnings\n")
|
|
||||||
for _, w := range warningsOnly {
|
|
||||||
node := w.Node
|
node := w.Node
|
||||||
if node == "" {
|
if node == "" {
|
||||||
node = "cluster-wide"
|
node = "cluster-wide"
|
||||||
@ -96,16 +241,83 @@ func buildAnalysisPrompt(results *Results, data *ClusterData) string {
|
|||||||
b.WriteString("\n")
|
b.WriteString("\n")
|
||||||
}
|
}
|
||||||
|
|
||||||
// Add raw RQLite status for context (condensed)
|
// Subsystem-specific raw data
|
||||||
b.WriteString("## Raw Cluster Data (condensed)\n")
|
contextData := buildSubsystemContext(subsystem, data)
|
||||||
|
if contextData != "" {
|
||||||
|
b.WriteString(fmt.Sprintf("## %s Raw Data\n", strings.ToUpper(subsystem)))
|
||||||
|
b.WriteString(contextData)
|
||||||
|
b.WriteString("\n")
|
||||||
|
}
|
||||||
|
|
||||||
|
// Healthy subsystems for cross-reference
|
||||||
|
if healthySummary != "" {
|
||||||
|
b.WriteString("## Healthy Subsystems (for context)\n")
|
||||||
|
b.WriteString(healthySummary)
|
||||||
|
b.WriteString("\n")
|
||||||
|
}
|
||||||
|
|
||||||
|
// Collection errors
|
||||||
|
if collectionErrors != "" {
|
||||||
|
b.WriteString("## Collection Errors\n")
|
||||||
|
b.WriteString(collectionErrors)
|
||||||
|
b.WriteString("\n")
|
||||||
|
}
|
||||||
|
|
||||||
|
b.WriteString(fmt.Sprintf("\nAnalyze the %s issues above.\n", subsystem))
|
||||||
|
return b.String()
|
||||||
|
}
|
||||||
|
|
||||||
|
// buildSubsystemContext dispatches to the right context builder.
|
||||||
|
func buildSubsystemContext(subsystem string, data *ClusterData) string {
|
||||||
|
switch subsystem {
|
||||||
|
case "rqlite":
|
||||||
|
return buildRQLiteContext(data)
|
||||||
|
case "olric":
|
||||||
|
return buildOlricContext(data)
|
||||||
|
case "ipfs":
|
||||||
|
return buildIPFSContext(data)
|
||||||
|
case "dns":
|
||||||
|
return buildDNSContext(data)
|
||||||
|
case "wireguard":
|
||||||
|
return buildWireGuardContext(data)
|
||||||
|
case "system":
|
||||||
|
return buildSystemContext(data)
|
||||||
|
case "network":
|
||||||
|
return buildNetworkContext(data)
|
||||||
|
case "namespace":
|
||||||
|
return buildNamespaceContext(data)
|
||||||
|
default:
|
||||||
|
return ""
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func buildRQLiteContext(data *ClusterData) string {
|
||||||
|
var b strings.Builder
|
||||||
for host, nd := range data.Nodes {
|
for host, nd := range data.Nodes {
|
||||||
if nd.RQLite != nil && nd.RQLite.Status != nil {
|
if nd.RQLite == nil {
|
||||||
s := nd.RQLite.Status
|
continue
|
||||||
b.WriteString(fmt.Sprintf("### %s (RQLite)\n", host))
|
}
|
||||||
|
b.WriteString(fmt.Sprintf("### %s\n", host))
|
||||||
|
if !nd.RQLite.Responsive {
|
||||||
|
b.WriteString(" NOT RESPONDING\n")
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
if s := nd.RQLite.Status; s != nil {
|
||||||
b.WriteString(fmt.Sprintf(" raft_state=%s term=%d applied=%d commit=%d leader=%s peers=%d voter=%v\n",
|
b.WriteString(fmt.Sprintf(" raft_state=%s term=%d applied=%d commit=%d leader=%s peers=%d voter=%v\n",
|
||||||
s.RaftState, s.Term, s.AppliedIndex, s.CommitIndex, s.LeaderNodeID, s.NumPeers, s.Voter))
|
s.RaftState, s.Term, s.AppliedIndex, s.CommitIndex, s.LeaderNodeID, s.NumPeers, s.Voter))
|
||||||
|
b.WriteString(fmt.Sprintf(" fsm_pending=%d db_size=%s version=%s goroutines=%d uptime=%s\n",
|
||||||
|
s.FsmPending, s.DBSizeFriendly, s.Version, s.Goroutines, s.Uptime))
|
||||||
|
}
|
||||||
|
if r := nd.RQLite.Readyz; r != nil {
|
||||||
|
b.WriteString(fmt.Sprintf(" readyz=%v store=%s leader=%s\n", r.Ready, r.Store, r.Leader))
|
||||||
|
}
|
||||||
|
if d := nd.RQLite.DebugVars; d != nil {
|
||||||
|
b.WriteString(fmt.Sprintf(" query_errors=%d execute_errors=%d leader_not_found=%d snapshot_errors=%d\n",
|
||||||
|
d.QueryErrors, d.ExecuteErrors, d.LeaderNotFound, d.SnapshotErrors))
|
||||||
|
}
|
||||||
|
b.WriteString(fmt.Sprintf(" strong_read=%v\n", nd.RQLite.StrongRead))
|
||||||
if nd.RQLite.Nodes != nil {
|
if nd.RQLite.Nodes != nil {
|
||||||
b.WriteString(fmt.Sprintf(" /nodes reports %d members:", len(nd.RQLite.Nodes)))
|
b.WriteString(fmt.Sprintf(" /nodes (%d members):", len(nd.RQLite.Nodes)))
|
||||||
for addr, n := range nd.RQLite.Nodes {
|
for addr, n := range nd.RQLite.Nodes {
|
||||||
reachable := "ok"
|
reachable := "ok"
|
||||||
if !n.Reachable {
|
if !n.Reachable {
|
||||||
@ -120,15 +332,157 @@ func buildAnalysisPrompt(results *Results, data *ClusterData) string {
|
|||||||
b.WriteString("\n")
|
b.WriteString("\n")
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
return b.String()
|
||||||
}
|
}
|
||||||
|
|
||||||
b.WriteString("\n## Task\n")
|
func buildOlricContext(data *ClusterData) string {
|
||||||
b.WriteString("Analyze the failures and warnings above. For each issue:\n")
|
var b strings.Builder
|
||||||
b.WriteString("1. Explain the root cause\n")
|
for host, nd := range data.Nodes {
|
||||||
b.WriteString("2. Assess the severity and impact on the cluster\n")
|
if nd.Olric == nil {
|
||||||
b.WriteString("3. Suggest specific commands or actions to fix it\n")
|
continue
|
||||||
b.WriteString("\nBe concise and actionable. Group related issues together. Use markdown formatting.\n")
|
}
|
||||||
|
o := nd.Olric
|
||||||
|
b.WriteString(fmt.Sprintf("### %s\n", host))
|
||||||
|
b.WriteString(fmt.Sprintf(" active=%v memberlist=%v members=%d coordinator=%s\n",
|
||||||
|
o.ServiceActive, o.MemberlistUp, o.MemberCount, o.Coordinator))
|
||||||
|
b.WriteString(fmt.Sprintf(" memory=%dMB restarts=%d log_errors=%d suspects=%d flapping=%d\n",
|
||||||
|
o.ProcessMemMB, o.RestartCount, o.LogErrors, o.LogSuspects, o.LogFlapping))
|
||||||
|
}
|
||||||
|
return b.String()
|
||||||
|
}
|
||||||
|
|
||||||
|
func buildIPFSContext(data *ClusterData) string {
|
||||||
|
var b strings.Builder
|
||||||
|
for host, nd := range data.Nodes {
|
||||||
|
if nd.IPFS == nil {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
ip := nd.IPFS
|
||||||
|
repoPct := 0.0
|
||||||
|
if ip.RepoMaxBytes > 0 {
|
||||||
|
repoPct = float64(ip.RepoSizeBytes) / float64(ip.RepoMaxBytes) * 100
|
||||||
|
}
|
||||||
|
b.WriteString(fmt.Sprintf("### %s\n", host))
|
||||||
|
b.WriteString(fmt.Sprintf(" daemon=%v cluster=%v swarm_peers=%d cluster_peers=%d cluster_errors=%d\n",
|
||||||
|
ip.DaemonActive, ip.ClusterActive, ip.SwarmPeerCount, ip.ClusterPeerCount, ip.ClusterErrors))
|
||||||
|
b.WriteString(fmt.Sprintf(" repo=%.0f%% (%d/%d bytes) kubo=%s cluster=%s\n",
|
||||||
|
repoPct, ip.RepoSizeBytes, ip.RepoMaxBytes, ip.KuboVersion, ip.ClusterVersion))
|
||||||
|
b.WriteString(fmt.Sprintf(" swarm_key=%v bootstrap_empty=%v\n", ip.HasSwarmKey, ip.BootstrapEmpty))
|
||||||
|
}
|
||||||
|
return b.String()
|
||||||
|
}
|
||||||
|
|
||||||
|
func buildDNSContext(data *ClusterData) string {
|
||||||
|
var b strings.Builder
|
||||||
|
for host, nd := range data.Nodes {
|
||||||
|
if nd.DNS == nil {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
d := nd.DNS
|
||||||
|
b.WriteString(fmt.Sprintf("### %s\n", host))
|
||||||
|
b.WriteString(fmt.Sprintf(" coredns=%v caddy=%v ports(53=%v,80=%v,443=%v) corefile=%v\n",
|
||||||
|
d.CoreDNSActive, d.CaddyActive, d.Port53Bound, d.Port80Bound, d.Port443Bound, d.CorefileExists))
|
||||||
|
b.WriteString(fmt.Sprintf(" memory=%dMB restarts=%d log_errors=%d\n",
|
||||||
|
d.CoreDNSMemMB, d.CoreDNSRestarts, d.LogErrors))
|
||||||
|
b.WriteString(fmt.Sprintf(" resolve: SOA=%v NS=%v(count=%d) wildcard=%v base_A=%v\n",
|
||||||
|
d.SOAResolves, d.NSResolves, d.NSRecordCount, d.WildcardResolves, d.BaseAResolves))
|
||||||
|
b.WriteString(fmt.Sprintf(" tls: base=%d days, wildcard=%d days\n",
|
||||||
|
d.BaseTLSDaysLeft, d.WildTLSDaysLeft))
|
||||||
|
}
|
||||||
|
return b.String()
|
||||||
|
}
|
||||||
|
|
||||||
|
func buildWireGuardContext(data *ClusterData) string {
|
||||||
|
var b strings.Builder
|
||||||
|
for host, nd := range data.Nodes {
|
||||||
|
if nd.WireGuard == nil {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
wg := nd.WireGuard
|
||||||
|
b.WriteString(fmt.Sprintf("### %s\n", host))
|
||||||
|
b.WriteString(fmt.Sprintf(" interface=%v service=%v ip=%s port=%d peers=%d mtu=%d\n",
|
||||||
|
wg.InterfaceUp, wg.ServiceActive, wg.WgIP, wg.ListenPort, wg.PeerCount, wg.MTU))
|
||||||
|
b.WriteString(fmt.Sprintf(" config=%v perms=%s\n", wg.ConfigExists, wg.ConfigPerms))
|
||||||
|
for _, p := range wg.Peers {
|
||||||
|
age := "never"
|
||||||
|
if p.LatestHandshake > 0 {
|
||||||
|
age = fmt.Sprintf("%ds ago", time.Now().Unix()-p.LatestHandshake)
|
||||||
|
}
|
||||||
|
keyPrefix := p.PublicKey
|
||||||
|
if len(keyPrefix) > 8 {
|
||||||
|
keyPrefix = keyPrefix[:8] + "..."
|
||||||
|
}
|
||||||
|
b.WriteString(fmt.Sprintf(" peer %s: allowed=%s handshake=%s rx=%d tx=%d\n",
|
||||||
|
keyPrefix, p.AllowedIPs, age, p.TransferRx, p.TransferTx))
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return b.String()
|
||||||
|
}
|
||||||
|
|
||||||
|
func buildSystemContext(data *ClusterData) string {
|
||||||
|
var b strings.Builder
|
||||||
|
for host, nd := range data.Nodes {
|
||||||
|
if nd.System == nil {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
s := nd.System
|
||||||
|
memPct := 0
|
||||||
|
if s.MemTotalMB > 0 {
|
||||||
|
memPct = s.MemUsedMB * 100 / s.MemTotalMB
|
||||||
|
}
|
||||||
|
b.WriteString(fmt.Sprintf("### %s\n", host))
|
||||||
|
b.WriteString(fmt.Sprintf(" mem=%d%% (%d/%dMB) disk=%d%% load=%s cpus=%d\n",
|
||||||
|
memPct, s.MemUsedMB, s.MemTotalMB, s.DiskUsePct, s.LoadAvg, s.CPUCount))
|
||||||
|
b.WriteString(fmt.Sprintf(" oom=%d swap=%d/%dMB inodes=%d%% ufw=%v user=%s panics=%d\n",
|
||||||
|
s.OOMKills, s.SwapUsedMB, s.SwapTotalMB, s.InodePct, s.UFWActive, s.ProcessUser, s.PanicCount))
|
||||||
|
if len(s.FailedUnits) > 0 {
|
||||||
|
b.WriteString(fmt.Sprintf(" failed_units: %s\n", strings.Join(s.FailedUnits, ", ")))
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return b.String()
|
||||||
|
}
|
||||||
|
|
||||||
|
func buildNetworkContext(data *ClusterData) string {
|
||||||
|
var b strings.Builder
|
||||||
|
for host, nd := range data.Nodes {
|
||||||
|
if nd.Network == nil {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
n := nd.Network
|
||||||
|
b.WriteString(fmt.Sprintf("### %s\n", host))
|
||||||
|
b.WriteString(fmt.Sprintf(" internet=%v default_route=%v wg_route=%v\n",
|
||||||
|
n.InternetReachable, n.DefaultRoute, n.WGRouteExists))
|
||||||
|
b.WriteString(fmt.Sprintf(" tcp: established=%d time_wait=%d retransmit=%.2f%%\n",
|
||||||
|
n.TCPEstablished, n.TCPTimeWait, n.TCPRetransRate))
|
||||||
|
if len(n.PingResults) > 0 {
|
||||||
|
var failed []string
|
||||||
|
for ip, ok := range n.PingResults {
|
||||||
|
if !ok {
|
||||||
|
failed = append(failed, ip)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if len(failed) > 0 {
|
||||||
|
b.WriteString(fmt.Sprintf(" mesh_ping_failed: %s\n", strings.Join(failed, ", ")))
|
||||||
|
} else {
|
||||||
|
b.WriteString(fmt.Sprintf(" mesh_ping: all %d peers OK\n", len(n.PingResults)))
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return b.String()
|
||||||
|
}
|
||||||
|
|
||||||
|
func buildNamespaceContext(data *ClusterData) string {
|
||||||
|
var b strings.Builder
|
||||||
|
for host, nd := range data.Nodes {
|
||||||
|
if len(nd.Namespaces) == 0 {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
b.WriteString(fmt.Sprintf("### %s (%d namespaces)\n", host, len(nd.Namespaces)))
|
||||||
|
for _, ns := range nd.Namespaces {
|
||||||
|
b.WriteString(fmt.Sprintf(" ns=%s port_base=%d rqlite=%v(state=%s,ready=%v) olric=%v gateway=%v(status=%d)\n",
|
||||||
|
ns.Name, ns.PortBase, ns.RQLiteUp, ns.RQLiteState, ns.RQLiteReady, ns.OlricUp, ns.GatewayUp, ns.GatewayStatus))
|
||||||
|
}
|
||||||
|
}
|
||||||
return b.String()
|
return b.String()
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -160,6 +514,7 @@ func callOpenRouter(model, apiKey, prompt string) (string, error) {
|
|||||||
reqBody := openRouterRequest{
|
reqBody := openRouterRequest{
|
||||||
Model: model,
|
Model: model,
|
||||||
Messages: []openRouterMessage{
|
Messages: []openRouterMessage{
|
||||||
|
{Role: "system", Content: systemPrompt},
|
||||||
{Role: "user", Content: prompt},
|
{Role: "user", Content: prompt},
|
||||||
},
|
},
|
||||||
}
|
}
|
||||||
@ -220,10 +575,20 @@ func truncate(s string, max int) string {
|
|||||||
return s[:max] + "..."
|
return s[:max] + "..."
|
||||||
}
|
}
|
||||||
|
|
||||||
// PrintAnalysis writes the AI analysis to the output.
|
// PrintAnalysis writes the AI analysis to the output, one section per subsystem.
|
||||||
func PrintAnalysis(analysis *AnalysisResult, w io.Writer) {
|
func PrintAnalysis(result *AnalysisResult, w io.Writer) {
|
||||||
fmt.Fprintf(w, "\n## AI Analysis (%s)\n", analysis.Model)
|
fmt.Fprintf(w, "\n## AI Analysis (%s)\n", result.Model)
|
||||||
fmt.Fprintf(w, "%s\n", strings.Repeat("-", 70))
|
fmt.Fprintf(w, "%s\n", strings.Repeat("-", 70))
|
||||||
fmt.Fprintf(w, "%s\n", analysis.Analysis)
|
|
||||||
fmt.Fprintf(w, "\n(Analysis took %.1fs)\n", analysis.Duration.Seconds())
|
for _, sa := range result.Analyses {
|
||||||
|
fmt.Fprintf(w, "\n### %s\n\n", strings.ToUpper(sa.Subsystem))
|
||||||
|
if sa.Error != nil {
|
||||||
|
fmt.Fprintf(w, "Analysis failed: %v\n", sa.Error)
|
||||||
|
} else {
|
||||||
|
fmt.Fprintf(w, "%s\n", sa.Analysis)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
fmt.Fprintf(w, "\n%s\n", strings.Repeat("-", 70))
|
||||||
|
fmt.Fprintf(w, "(Analysis took %.1fs — %d subsystems analyzed)\n", result.Duration.Seconds(), len(result.Analyses))
|
||||||
}
|
}
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user