diff --git a/core/Makefile b/core/Makefile index c685f44..d1c793b 100644 --- a/core/Makefile +++ b/core/Makefile @@ -63,7 +63,7 @@ test-e2e-quick: .PHONY: build clean test deps tidy fmt vet lint install-hooks push-devnet push-testnet rollout-devnet rollout-testnet release -VERSION := 0.122.2 +VERSION := 0.122.3 COMMIT ?= $(shell git rev-parse --short HEAD 2>/dev/null || echo unknown) DATE ?= $(shell date -u +%Y-%m-%dT%H:%M:%SZ) LDFLAGS := -X 'main.version=$(VERSION)' -X 'main.commit=$(COMMIT)' -X 'main.date=$(DATE)' diff --git a/core/cli b/core/cli new file mode 100755 index 0000000..0a39f53 Binary files /dev/null and b/core/cli differ diff --git a/core/cmd/gateway/config.go b/core/cmd/gateway/config.go index e263d1d..e97f27f 100644 --- a/core/cmd/gateway/config.go +++ b/core/cmd/gateway/config.go @@ -77,21 +77,26 @@ func parseGatewayConfig(logger *logging.ColoredLogger) *gateway.Config { } type yamlCfg struct { - ListenAddr string `yaml:"listen_addr"` - ClientNamespace string `yaml:"client_namespace"` - RQLiteDSN string `yaml:"rqlite_dsn"` - GlobalRQLiteDSN string `yaml:"global_rqlite_dsn"` - Peers []string `yaml:"bootstrap_peers"` - EnableHTTPS bool `yaml:"enable_https"` - DomainName string `yaml:"domain_name"` - TLSCacheDir string `yaml:"tls_cache_dir"` - OlricServers []string `yaml:"olric_servers"` - OlricTimeout string `yaml:"olric_timeout"` - IPFSClusterAPIURL string `yaml:"ipfs_cluster_api_url"` - IPFSAPIURL string `yaml:"ipfs_api_url"` - IPFSTimeout string `yaml:"ipfs_timeout"` - IPFSReplicationFactor int `yaml:"ipfs_replication_factor"` + ListenAddr string `yaml:"listen_addr"` + ClientNamespace string `yaml:"client_namespace"` + RQLiteDSN string `yaml:"rqlite_dsn"` + GlobalRQLiteDSN string `yaml:"global_rqlite_dsn"` + Peers []string `yaml:"bootstrap_peers"` + EnableHTTPS bool `yaml:"enable_https"` + DomainName string `yaml:"domain_name"` + TLSCacheDir string `yaml:"tls_cache_dir"` + OlricServers []string `yaml:"olric_servers"` + OlricTimeout string `yaml:"olric_timeout"` + IPFSClusterAPIURL string `yaml:"ipfs_cluster_api_url"` + IPFSAPIURL string `yaml:"ipfs_api_url"` + IPFSTimeout string `yaml:"ipfs_timeout"` + IPFSReplicationFactor int `yaml:"ipfs_replication_factor"` WebRTC yamlWebRTCCfg `yaml:"webrtc"` + // ClusterSecretPath: see GatewayYAMLConfig docstring. Optional; + // when set, the standalone gateway reads the file at this path + // and populates cfg.ClusterSecret so JWT signing keys can be + // derived deterministically (bug #215 fix). + ClusterSecretPath string `yaml:"cluster_secret_path"` } data, err := os.ReadFile(configPath) @@ -200,6 +205,30 @@ func parseGatewayConfig(logger *logging.ColoredLogger) *gateway.Config { cfg.IPFSReplicationFactor = y.IPFSReplicationFactor } + // Cluster secret — bug #215 fix. The host-managed gateway in + // pkg/node/gateway.go reads this from a known on-disk path; the + // standalone binary (used by namespace gateways via systemd) needs the + // same access so it can derive the cluster-wide Ed25519 JWT signing + // key. Without this, namespace gateways had per-node random keys and + // JWTs minted on one node were unverifiable on another, leaving + // `caller_jwt_subject` empty in serverless host functions. + if path := strings.TrimSpace(y.ClusterSecretPath); path != "" { + secretBytes, err := os.ReadFile(path) + if err != nil { + logger.ComponentError(logging.ComponentGeneral, + "cluster_secret_path is set but the file is unreadable; "+ + "JWTs will use a per-node random signing key and will not "+ + "verify cross-node — bug #215 will reproduce", + zap.String("path", path), + zap.Error(err)) + } else { + cfg.ClusterSecret = strings.TrimSpace(string(secretBytes)) + logger.ComponentInfo(logging.ComponentGeneral, + "Loaded cluster secret for cluster-wide JWT signing key derivation", + zap.String("path", path)) + } + } + // WebRTC configuration cfg.WebRTCEnabled = y.WebRTC.Enabled if y.WebRTC.SFUPort > 0 { diff --git a/core/docs/DEV_DEPLOY.md b/core/docs/DEV_DEPLOY.md index 89701c7..e30c243 100644 --- a/core/docs/DEV_DEPLOY.md +++ b/core/docs/DEV_DEPLOY.md @@ -489,25 +489,59 @@ sudo cp caddy-root-ca.crt /usr/local/share/ca-certificates/caddy-root-ca.crt sudo update-ca-certificates ``` -## Push notifications (optional, per gateway) +## Push notifications -Push routes (`/v1/push/devices`, `/v1/push/send`) are always registered but -return `503 SERVICE_UNAVAILABLE` until at least one provider is configured -in the gateway config: +Push provider configuration is **tenant-self-service** as of bug #220 +follow-up. Tenants set their own ntfy / Expo credentials via authenticated +HTTP — operators no longer need to edit YAML and restart for every namespace +that wants push. -```yaml -# In your namespace gateway config: -push: - ntfy_base_url: "http://localhost:8080" # self-hosted ntfy (preferred for privacy) - expo_access_token: "..." # Expo push (alternative; client SDK lock-in) +### Tenant flow (no operator involvement) + +```bash +# Set per-namespace config +curl -X PUT https://ns-anchat-test.orama-devnet.network/v1/push/config \ + -H 'Authorization: Bearer ' \ + -H 'Content-Type: application/json' \ + -d '{"ntfy_base_url": "https://ntfy.sh"}' + +# Read current config (secrets redacted to booleans) +curl https://ns-anchat-test.orama-devnet.network/v1/push/config \ + -H 'Authorization: Bearer ' + +# Clear (push reverts to gateway YAML defaults, or 503 if no defaults) +curl -X DELETE https://ns-anchat-test.orama-devnet.network/v1/push/config \ + -H 'Authorization: Bearer ' ``` -Restart the gateway after editing. A `GET /v1/push/devices` call (with valid -auth) will then return `200` instead of `503`. +Per-namespace config takes effect on the NEXT push send (the cached +dispatcher is invalidated on PUT/DELETE). No restart needed. -The 503 body explicitly names the config knobs operators need to set, so -tenants getting "Push not configured" can self-diagnose without filing a -ticket. (Bug #220 audit fix.) +### Operator flow (cluster-wide defaults — optional) + +Operators can still seed defaults in the gateway YAML. Per-namespace config +OVERRIDES the defaults; namespaces with no row inherit them. + +```yaml +# Cluster-wide push defaults (optional; tenants override per-namespace) +push: + ntfy_base_url: "https://ntfy.sh" # default for namespaces with no override + expo_access_token: "..." # default Expo token +``` + +### Encryption + +Sensitive credentials (`ntfy_auth_token`, `expo_access_token`) are +AES-256-GCM-encrypted at rest in the `namespace_push_config` table using +a key derived from the cluster secret. The GET endpoint returns boolean +`has_X` flags only — credentials are NEVER echoed back over HTTP. + +### Disabling push entirely + +If `cluster_secret` isn't configured on the gateway, the push subsystem +is disabled and `/v1/push/*` returns 503. To enable: set the cluster secret +and restart. (This is the only operator-side restart still required, and +it's a one-time action at gateway provisioning.) ## Project Structure diff --git a/core/docs/SERVERLESS.md b/core/docs/SERVERLESS.md index bec5d97..195ba8d 100644 --- a/core/docs/SERVERLESS.md +++ b/core/docs/SERVERLESS.md @@ -194,6 +194,49 @@ The legacy `db_execute` is kept indefinitely so existing functions don't break. | `log_info(message)` | Log info-level message (captured in invocation logs). | | `log_error(message)` | Log error-level message. | +## Configuring Push Notifications (per-namespace) + +Push providers (ntfy / Expo) are configured **per namespace** by the tenant — +no operator involvement, no SSH access required. Set, read, or clear via: + +```bash +# Set / update (sensitive credentials are encrypted at rest) +curl -X PUT https://ns-myapp.example.com/v1/push/config \ + -H 'Authorization: Bearer ' \ + -H 'Content-Type: application/json' \ + -d '{ + "ntfy_base_url": "https://ntfy.sh", + "ntfy_auth_token": "tk_…" + }' + +# Read (sensitive fields redacted to booleans) +curl https://ns-myapp.example.com/v1/push/config \ + -H 'Authorization: Bearer ' + +# Clear (push reverts to gateway-wide defaults if any, else 503) +curl -X DELETE https://ns-myapp.example.com/v1/push/config \ + -H 'Authorization: Bearer ' +``` + +### Field semantics + +| Field | Sensitive? | Notes | +|---|---|---| +| `ntfy_base_url` | No | URL of the ntfy server. `https://ntfy.sh` works for testing. | +| `ntfy_auth_token` | Yes | Optional bearer token sent to ntfy. Encrypted at rest. | +| `expo_access_token` | Yes | Expo Push API access token. Encrypted at rest. | + +PUT semantics are **field-level** — a `null` (or omitted) field leaves the +existing value alone; an explicit empty string clears just that field. To +clear EVERYTHING use DELETE. + +After a PUT the next `push_send` (host call) or `POST /v1/push/send` uses +the new providers — the cached dispatcher is invalidated automatically. + +If no per-namespace config is set AND the gateway has no YAML defaults, the +push endpoints return **503 SERVICE_UNAVAILABLE** with a message naming the +exact config to set. + ## Managing Secrets Secrets are encrypted at rest (AES-256-GCM) and scoped to your namespace. Functions read them via `get_secret("name")` at runtime. diff --git a/core/migrations/026_namespace_push_config.sql b/core/migrations/026_namespace_push_config.sql new file mode 100644 index 0000000..554ba7f --- /dev/null +++ b/core/migrations/026_namespace_push_config.sql @@ -0,0 +1,26 @@ +-- ============================================================================= +-- 026_namespace_push_config.sql +-- +-- Per-namespace push notification provider configuration. Tenants set their +-- own ntfy / expo credentials via PUT /v1/push/config without operator +-- involvement (bug #220 follow-up — self-service tenant config). +-- +-- Sensitive credentials (auth tokens) are AES-256-GCM ciphertext via +-- pkg/secrets, prefix 'enc:'. Non-secret URLs (ntfy_base_url) stored +-- plaintext — they leak no security material. +-- +-- The gateway YAML config remains as a global fallback / default. A row +-- in this table OVERRIDES the YAML for that namespace; absence falls back. +-- ============================================================================= + +CREATE TABLE IF NOT EXISTS namespace_push_config ( + namespace TEXT PRIMARY KEY, + -- ntfy provider config (URL is non-secret; auth token is) + ntfy_base_url TEXT, + ntfy_auth_token_encrypted TEXT, + -- expo provider config (the access token IS sensitive) + expo_access_token_encrypted TEXT, + -- Audit metadata: who set this, and when (last update wins). + updated_at INTEGER NOT NULL, + updated_by TEXT +); diff --git a/core/pkg/cli/cmd/pushcmd/push.go b/core/pkg/cli/cmd/pushcmd/push.go index f2c8a19..939b020 100644 --- a/core/pkg/cli/cmd/pushcmd/push.go +++ b/core/pkg/cli/cmd/pushcmd/push.go @@ -98,7 +98,7 @@ Examples: return pushDirect(nodes, archivePath) } - // Multi-node with --fanout: use agent forwarding + // Multi-node with --fanout: upload to hub, fan out server-to-server return pushFanout(nodes, archivePath) }, } @@ -135,27 +135,35 @@ func pushDirect(nodes []inspector.Node, archivePath string) error { return nil } -// pushFanout uploads the archive to the first node, then fans out server-to-server -// using SSH agent forwarding. +// pushFanout uploads the archive to the first node (hub), then fans out +// server-to-server: the hub SCPs the archive to all other nodes in parallel +// and SSHes in to extract it. +// +// Key design — no SSH agent forwarding: +// +// The previous implementation loaded all N node keys into the system ssh-agent +// and used agent forwarding (-A) so the hub could reach targets. That caused +// "Too many authentication failures": when the hub connected to a target, the +// forwarded agent offered all N keys sequentially; if N exceeds the server's +// MaxAuthTries (default 6 on most distros), the server disconnects before the +// correct key is tried. +// +// Fix: PrepareNodeKeys (called by the parent command) already fetched and wrote +// each node's private key to a temp file (node.SSHKey). We base64-encode each +// key and embed it directly in the fanout bash script. On the hub, the script +// writes each key to its own mktemp file, uses -o IdentitiesOnly=yes -i $K +// (only ONE key offered per connection), and deletes the temp file immediately. +// No ssh-agent involved on either end; MaxAuthTries is irrelevant. func pushFanout(nodes []inspector.Node, archivePath string) error { fmt.Printf("Pushing to %d node(s) (fanout)...\n\n", len(nodes)) hub := nodes[0] targets := nodes[1:] remotePath := "/tmp/" + filepath.Base(archivePath) - // Hub extraction keeps the archive so it can be fanned out to targets. - // The cleanup at the end removes it. - hubExtractCmd := fmt.Sprintf("mkdir -p /opt/orama && tar xzf %s -C /opt/orama", remotePath) - // Target extraction deletes the archive after extracting. - targetExtractCmd := fmt.Sprintf("mkdir -p /opt/orama && tar xzf %s -C /opt/orama && rm -f %s", remotePath, remotePath) - // Load SSH keys into the system ssh-agent for agent forwarding - fmt.Println(" Loading SSH keys into agent...") - if err := remotessh.LoadAgentKeys(nodes); err != nil { - fmt.Printf(" Warning: failed to load agent keys: %v\n", err) - fmt.Println(" Falling back to direct push...") - return pushDirect(nodes, archivePath) - } + // Hub keeps the archive on disk after extracting — targets SCP it from here. + // The archive is removed from the hub at the end of this function. + hubExtractCmd := fmt.Sprintf("mkdir -p /opt/orama && tar xzf %s -C /opt/orama", remotePath) // Upload archive to hub fmt.Printf(" %s (hub): uploading...", hub.Host) @@ -168,27 +176,53 @@ func pushFanout(nodes []inspector.Node, archivePath string) error { } fmt.Println(" OK") - // Build the fanout command — hub SCPs to all targets in parallel + // Build the fanout script. Each target gets its own shell subshell that: + // 1. Writes the target's SSH private key to a mktemp file ($K). + // 2. SCPs the archive from the hub's local path to the target. + // 3. SSHes into the target to extract it. + // 4. Deletes $K — key material never lingers on the hub. + // + // All subshells run in parallel (&); a final `wait` collects them. + // The entire script is base64-encoded before transmission to avoid shell + // quoting conflicts (the script contains both single and double quotes). var fanoutParts []string for _, t := range targets { - scpCmd := fmt.Sprintf( - "scp -o StrictHostKeyChecking=accept-new -o IdentitiesOnly=no %s %s@%s:%s && ssh -o StrictHostKeyChecking=accept-new %s@%s 'sudo bash -c \"%s\"' && echo '%s: done'", + keyBytes, err := os.ReadFile(t.SSHKey) + if err != nil { + // SSHKey was populated by PrepareNodeKeys; this should never + // happen unless the temp file was somehow deleted mid-run. + fmt.Printf(" Warning: could not read key for %s: %v (skipping)\n", t.Host, err) + continue + } + // base64 alphabet is [A-Za-z0-9+/=] — no shell metacharacters — + // safe to embed in single-quoted strings inside the script. + keyB64 := base64.StdEncoding.EncodeToString(keyBytes) + + part := fmt.Sprintf( + "(K=$(mktemp) && echo '%s' | base64 -d >\"$K\" && chmod 600 \"$K\" && "+ + "scp -o StrictHostKeyChecking=accept-new -o ConnectTimeout=30 -o IdentitiesOnly=yes -i \"$K\" %s %s@%s:%s && "+ + "ssh -o StrictHostKeyChecking=accept-new -o ConnectTimeout=30 -o IdentitiesOnly=yes -i \"$K\" %s@%s "+ + "'sudo bash -c \"mkdir -p /opt/orama && tar xzf %s -C /opt/orama && rm -f %s\"' && "+ + "rm -f \"$K\" && echo '%s: done' || (rm -f \"$K\" 2>/dev/null; echo '%s: FAILED')) &", + keyB64, remotePath, t.User, t.Host, remotePath, - t.User, t.Host, targetExtractCmd, - t.Host, + t.User, t.Host, + remotePath, remotePath, + t.Host, t.Host, ) - fanoutParts = append(fanoutParts, "("+scpCmd+") &") + fanoutParts = append(fanoutParts, part) } fanoutParts = append(fanoutParts, "wait", "echo 'Fanout complete'") fanoutScript := strings.Join(fanoutParts, "\n") - // Base64-encode the script to avoid shell quoting conflicts — the script - // contains single quotes (ssh '...') that would break a bash -c '...' wrapper. encoded := base64.StdEncoding.EncodeToString([]byte(fanoutScript)) runCmd := fmt.Sprintf("echo %s | base64 -d | bash", encoded) fmt.Printf(" Fanning out to %d nodes from %s...\n", len(targets), hub.Host) - if err := remotessh.RunSSHStreaming(hub, runCmd, remotessh.WithAgentForward()); err != nil { + // No agent forwarding — hub authenticates to each target using only + // that target's key (embedded above). IdentitiesOnly=yes ensures the + // hub's own host key is never accidentally offered to other nodes. + if err := remotessh.RunSSHStreaming(hub, runCmd); err != nil { fmt.Printf(" Fanout failed: %v\n", err) fmt.Println(" Some nodes may not have been updated") } diff --git a/core/pkg/cli/functions/triggers.go b/core/pkg/cli/functions/triggers.go index 4c4f842..3ee0053 100644 --- a/core/pkg/cli/functions/triggers.go +++ b/core/pkg/cli/functions/triggers.go @@ -6,6 +6,7 @@ import ( "fmt" "io" "text/tabwriter" + "time" "github.com/spf13/cobra" ) @@ -126,32 +127,83 @@ func runTriggersList(cmd *cobra.Command, args []string) error { } w := tabwriter.NewWriter(cmd.OutOrStdout(), 0, 0, 2, ' ', 0) - fmt.Fprintln(w, "ID\tTOPIC\tENABLED") + // Bug #65 audit: the previous CLI rendered only ID/TOPIC/ENABLED, so cron + // triggers appeared as mystery blank-topic rows. The handler returns a + // `kind` discriminator plus pubsub-only `topic` or cron-only + // `cron_expression` / `next_run_at` / `last_run_at`; the CLI now renders + // both kinds in a single unified table. + fmt.Fprintln(w, "ID\tKIND\tSCHEDULE/TOPIC\tNEXT RUN\tLAST RUN\tENABLED") for _, t := range triggers { tr, ok := t.(map[string]interface{}) if !ok { continue } - id, _ := tr["ID"].(string) - if id == "" { - id, _ = tr["id"].(string) + id := stringField(tr, "id", "ID") + kind := stringField(tr, "kind", "Kind") + // Backward compat: pre-#65 servers returned only `topic` with no + // `kind` field. Treat those as pubsub. + if kind == "" { + kind = "pubsub" } - topic, _ := tr["Topic"].(string) - if topic == "" { - topic, _ = tr["topic"].(string) + + var what, nextRun, lastRun string + switch kind { + case "cron": + what = stringField(tr, "cron_expression", "CronExpression") + nextRun = formatCronTimestamp(tr["next_run_at"]) + lastRun = formatCronTimestamp(tr["last_run_at"]) + default: // pubsub or unknown + what = stringField(tr, "topic", "Topic") + nextRun = "-" + lastRun = "-" } + enabled := true if e, ok := tr["Enabled"].(bool); ok { enabled = e } else if e, ok := tr["enabled"].(bool); ok { enabled = e } - fmt.Fprintf(w, "%s\t%s\t%v\n", id, topic, enabled) + fmt.Fprintf(w, "%s\t%s\t%s\t%s\t%s\t%v\n", id, kind, what, nextRun, lastRun, enabled) } w.Flush() return nil } +// stringField pulls a string from a JSON-decoded map under any of the +// supplied keys, in order. The handler emits snake_case (`cron_expression`) +// while older Go-tagged structs may surface PascalCase — try both. +func stringField(m map[string]interface{}, keys ...string) string { + for _, k := range keys { + if v, ok := m[k].(string); ok && v != "" { + return v + } + } + return "" +} + +// formatCronTimestamp renders a JSON timestamp from the handler in a compact +// human-readable form. Returns "-" for nil / unparseable values so the CLI +// table stays aligned for never-run / pubsub rows. +func formatCronTimestamp(v interface{}) string { + if v == nil { + return "-" + } + s, ok := v.(string) + if !ok || s == "" { + return "-" + } + // Try RFC3339 first (Go's default time.Time JSON encoding); fall back to + // the raw string so unexpected formats don't disappear silently. + if ts, err := time.Parse(time.RFC3339, s); err == nil { + return ts.UTC().Format("2006-01-02 15:04:05 UTC") + } + if ts, err := time.Parse(time.RFC3339Nano, s); err == nil { + return ts.UTC().Format("2006-01-02 15:04:05 UTC") + } + return s +} + func runTriggersDelete(cmd *cobra.Command, args []string) error { funcName := args[0] triggerID := args[1] diff --git a/core/pkg/cli/functions/triggers_test.go b/core/pkg/cli/functions/triggers_test.go new file mode 100644 index 0000000..002bd2f --- /dev/null +++ b/core/pkg/cli/functions/triggers_test.go @@ -0,0 +1,114 @@ +package functions + +import ( + "testing" +) + +// ---------------------------------------------------------------------------- +// stringField — pulls value from JSON-decoded map under any of the given keys +// ---------------------------------------------------------------------------- + +func TestStringField_prefersFirstKey(t *testing.T) { + m := map[string]interface{}{ + "id": "first", + "ID": "second", + } + if got := stringField(m, "id", "ID"); got != "first" { + t.Errorf("stringField = %q, want %q", got, "first") + } +} + +func TestStringField_fallsThroughWhenFirstMissing(t *testing.T) { + m := map[string]interface{}{ + "ID": "second", + } + if got := stringField(m, "id", "ID"); got != "second" { + t.Errorf("stringField = %q, want %q", got, "second") + } +} + +func TestStringField_emptyValueSkipped(t *testing.T) { + // An empty string under the first key MUST fall through to subsequent + // keys, otherwise empty pubsub `topic` fields would shadow valid + // PascalCase `Topic`. + m := map[string]interface{}{ + "id": "", + "ID": "fallback", + } + if got := stringField(m, "id", "ID"); got != "fallback" { + t.Errorf("stringField = %q, want %q", got, "fallback") + } +} + +func TestStringField_nonStringValueSkipped(t *testing.T) { + m := map[string]interface{}{ + "id": 42, + "ID": "ok", + } + if got := stringField(m, "id", "ID"); got != "ok" { + t.Errorf("stringField = %q, want %q", got, "ok") + } +} + +func TestStringField_allMissingReturnsEmpty(t *testing.T) { + m := map[string]interface{}{"other": "value"} + if got := stringField(m, "id", "ID"); got != "" { + t.Errorf("stringField = %q, want empty", got) + } +} + +// ---------------------------------------------------------------------------- +// formatCronTimestamp — RFC3339 -> UTC display, "-" for missing/unparseable +// ---------------------------------------------------------------------------- + +func TestFormatCronTimestamp_nilReturnsDash(t *testing.T) { + if got := formatCronTimestamp(nil); got != "-" { + t.Errorf("formatCronTimestamp(nil) = %q, want %q", got, "-") + } +} + +func TestFormatCronTimestamp_emptyStringReturnsDash(t *testing.T) { + if got := formatCronTimestamp(""); got != "-" { + t.Errorf("formatCronTimestamp(\"\") = %q, want %q", got, "-") + } +} + +func TestFormatCronTimestamp_nonStringReturnsDash(t *testing.T) { + if got := formatCronTimestamp(42); got != "-" { + t.Errorf("formatCronTimestamp(42) = %q, want %q", got, "-") + } +} + +func TestFormatCronTimestamp_rfc3339(t *testing.T) { + got := formatCronTimestamp("2025-05-08T03:00:00Z") + want := "2025-05-08 03:00:00 UTC" + if got != want { + t.Errorf("formatCronTimestamp = %q, want %q", got, want) + } +} + +func TestFormatCronTimestamp_rfc3339Nano(t *testing.T) { + got := formatCronTimestamp("2025-05-08T03:00:00.123456789Z") + want := "2025-05-08 03:00:00 UTC" + if got != want { + t.Errorf("formatCronTimestamp nano = %q, want %q", got, want) + } +} + +func TestFormatCronTimestamp_rfc3339WithOffset(t *testing.T) { + // Non-UTC offsets must be normalised to UTC for the display. + got := formatCronTimestamp("2025-05-08T05:00:00+02:00") + want := "2025-05-08 03:00:00 UTC" + if got != want { + t.Errorf("formatCronTimestamp offset = %q, want %q", got, want) + } +} + +func TestFormatCronTimestamp_unparseableFallsBackToRaw(t *testing.T) { + // If the server returns an unexpected timestamp shape, surface it + // rather than silently dropping to "-" — operator visibility wins. + got := formatCronTimestamp("not-a-timestamp") + if got != "not-a-timestamp" { + t.Errorf("formatCronTimestamp unparseable = %q, want raw passthrough", got) + } +} diff --git a/core/pkg/gateway/auth/service_test.go b/core/pkg/gateway/auth/service_test.go index 197451f..434ca05 100644 --- a/core/pkg/gateway/auth/service_test.go +++ b/core/pkg/gateway/auth/service_test.go @@ -416,3 +416,73 @@ func TestJWKSHandler_RSAOnly(t *testing.T) { t.Errorf("expected RS256, got %s", result.Keys[0]["alg"]) } } + +// TestEdDSACrossServiceVerify is the regression test for bug #215. Two Service +// instances configured with the SAME Ed25519 key (the cluster-shared scenario +// produced by deterministic HKDF derivation in pkg/gateway/signing_key.go) +// must be able to verify each other's tokens. Without this guarantee a JWT +// minted on the main gateway is unverifiable on a namespace gateway and host +// functions see an empty caller_jwt_subject. +func TestEdDSACrossServiceVerify(t *testing.T) { + // Shared key — what HKDF would produce from the cluster secret. + _, shared, err := ed25519.GenerateKey(rand.Reader) + if err != nil { + t.Fatalf("generate shared key: %v", err) + } + + makeService := func() *Service { + s := createTestService(t) // RSA + mock client + s.SetEdDSAKey(shared) + return s + } + + signer := makeService() // simulates main gateway + verifier := makeService() // simulates namespace gateway (different process, same shared key) + + // Sanity: both services must agree on edKeyID since it is derived from the + // public key. If they don't, kid-based verification will silently fail. + if signer.edKeyID != verifier.edKeyID { + t.Fatalf("edKeyID mismatch: signer=%q verifier=%q", signer.edKeyID, verifier.edKeyID) + } + + const wantSub = "BNbN2RNQTsYrrywZCLnhV9j3hd38jwcRqfxBecZX7hDE" + const wantNS = "anchat-test" + token, _, err := signer.GenerateJWT(wantNS, wantSub, 15*time.Minute) + if err != nil { + t.Fatalf("signer.GenerateJWT: %v", err) + } + + claims, err := verifier.ParseAndVerifyJWT(token) + if err != nil { + t.Fatalf("cross-service verify failed: %v", err) + } + if claims.Sub != wantSub { + t.Errorf("Sub = %q, want %q", claims.Sub, wantSub) + } + if claims.Namespace != wantNS { + t.Errorf("Namespace = %q, want %q", claims.Namespace, wantNS) + } +} + +// TestEdDSACrossServiceVerify_differentKeysFail proves the verify gate is +// real: when two services have different Ed25519 keys (the broken state +// before bug #215 fix), tokens minted on one MUST NOT validate on the other. +// If this test ever passes, the deterministic-derivation guarantee is +// silently bypassed somewhere. +func TestEdDSACrossServiceVerify_differentKeysFail(t *testing.T) { + signer := createTestService(t) + _, signKey, _ := ed25519.GenerateKey(rand.Reader) + signer.SetEdDSAKey(signKey) + + verifier := createTestService(t) + _, verKey, _ := ed25519.GenerateKey(rand.Reader) + verifier.SetEdDSAKey(verKey) + + token, _, err := signer.GenerateJWT("ns", "sub", 15*time.Minute) + if err != nil { + t.Fatalf("GenerateJWT: %v", err) + } + if _, err := verifier.ParseAndVerifyJWT(token); err == nil { + t.Fatal("expected verification to fail with different signing keys, got nil error") + } +} diff --git a/core/pkg/gateway/dependencies.go b/core/pkg/gateway/dependencies.go index 671e0d8..3b053d6 100644 --- a/core/pkg/gateway/dependencies.go +++ b/core/pkg/gateway/dependencies.go @@ -82,11 +82,20 @@ type Dependencies struct { // Used by the ws_pubsub_bridge host function. Nil = disabled. WSBridge *wsbridge.Bridge - // Push notification dispatcher (nil when push isn't configured — - // hostfunc + HTTP handlers degrade to no-op / 503). + // Push notification dispatcher (legacy single-tier; nil when push + // isn't configured at all). When PushManager is also set, send paths + // route through the manager instead so per-namespace config wins. PushDispatcher *push.PushDispatcher PushDeviceStore push.PushDeviceStore + // PushManager wraps the device store + per-namespace config store so + // tenants self-serve their push provider config via PUT /v1/push/config. + // Nil when push subsystem isn't initialized (cluster secret missing). + // When set, this is the canonical send path; PushDispatcher is the + // fallback used only if Manager is somehow missing. + PushManager *push.Manager + PushConfigStore push.ConfigStore + // Authentication service AuthService *auth.Service } @@ -464,10 +473,18 @@ func initializeServerless(logger *logging.ColoredLogger, cfg *Config, deps *Depe secretsMgr = smImpl } - // Initialize push notification dispatcher if any provider is configured. - // Devices are stored encrypted in RQLite (see migration 023). Providers - // are registered based on gateway config; missing config = provider absent. - pushDispatcher, pushStore, err := buildPushDispatcher(cfg, deps.ORMClient, logger) + // Initialize push notification subsystem. + // + // Bug #220 follow-up: the subsystem now ALWAYS initializes when the + // cluster secret is available (so tenants can register devices and + // PUT their per-namespace push config), regardless of whether the + // gateway YAML has a default provider configured. The Manager wraps + // the device store + per-namespace ConfigStore; Send paths route + // through Manager so per-namespace config takes effect. + // + // PushDispatcher (legacy) is set only when YAML defaults exist — + // kept for back-compat with code that hasn't migrated to Manager. + pushDispatcher, pushStore, pushManager, pushCfgStore, err := buildPushDispatcher(cfg, deps.ORMClient, logger) if err != nil { // Non-fatal: log and continue. Functions calling push_send will get nil // (silent no-op) and HTTP /v1/push/* endpoints return 503. @@ -476,6 +493,8 @@ func initializeServerless(logger *logging.ColoredLogger, cfg *Config, deps *Depe } deps.PushDispatcher = pushDispatcher deps.PushDeviceStore = pushStore + deps.PushManager = pushManager + deps.PushConfigStore = pushCfgStore // Create host functions provider (allows functions to call Orama services) hostFuncsCfg := hostfunctions.HostFunctionsConfig{ @@ -494,7 +513,8 @@ func initializeServerless(logger *logging.ColoredLogger, cfg *Config, deps *Depe pubsubAdapter, // pubsub adapter for serverless functions deps.ServerlessWSMgr, secretsMgr, - pushDispatcher, // may be nil — PushSend hostfunc handles that + pushDispatcher, // legacy — fallback when manager isn't wired + pushManager, // bug #220 follow-up — per-namespace config deps.WSBridge, // may be nil; WSPubSubBridge returns explicit error hostFuncsCfg, logger.Logger, @@ -581,8 +601,21 @@ func initializeServerless(logger *logging.ColoredLogger, cfg *Config, deps *Depe return fmt.Errorf("failed to initialize auth service: %w", err) } - // Load or create EdDSA key for new JWT tokens - edKey, err := loadOrCreateEdSigningKey(cfg.DataDir, logger) + // Load or create EdDSA key for new JWT tokens. Bug #215 fix: when + // cfg.ClusterSecret is set, the key is derived deterministically from + // it via HKDF, so every gateway in the cluster shares the same Ed25519 + // keypair and JWTs verify cross-node. With an empty ClusterSecret the + // per-node legacy behaviour is retained (single-node test deployments). + if cfg.ClusterSecret == "" { + // Loud warning: a multi-node cluster booted without a cluster + // secret reproduces bug #215 (per-gateway random keys, JWTs + // unverifiable cross-node). Single-node test rigs are the only + // legitimate case. + logger.ComponentWarn(logging.ComponentGeneral, + "ClusterSecret is empty; JWT signing keys will be random per-node. "+ + "Multi-node clusters MUST set ClusterSecret or JWTs will not verify across gateways (bug #215).") + } + edKey, err := loadOrCreateEdSigningKey(cfg.DataDir, cfg.ClusterSecret, logger) if err != nil { logger.ComponentWarn(logging.ComponentGeneral, "Failed to load EdDSA signing key; new JWTs will use RS256", zap.Error(err)) @@ -791,39 +824,96 @@ func injectRQLiteAuth(dsn, username, password string) string { return dsn } -// buildPushDispatcher constructs a push.PushDispatcher + device store with -// all enabled providers. Returns (nil, nil, nil) when no provider is -// configured — that's a supported state, not an error. Returns (nil, nil, -// err) on hard init failures (e.g. cluster secret missing for the -// encrypted device store). -func buildPushDispatcher(cfg *Config, db rqlite.Client, logger *logging.ColoredLogger) (*push.PushDispatcher, push.PushDeviceStore, error) { - if cfg.NtfyBaseURL == "" && cfg.ExpoAccessToken == "" { - // No providers configured — push is disabled. - return nil, nil, nil - } +// buildPushDispatcher constructs the push subsystem. +// +// As of bug #220 follow-up, push always initializes when ClusterSecret is +// available, regardless of whether any YAML provider config is set: +// +// - Device store + ConfigStore always build (tenants need to register +// devices and set per-namespace push config even on gateways with no +// YAML defaults). +// - Manager wraps the stores + a YAML-derived Defaults fallback. Each +// namespace can override any default via PUT /v1/push/config. +// - The legacy single-tier dispatcher is built only when YAML defaults +// are non-empty — kept for back-compat with code paths that haven't +// migrated to Manager. +// +// Returns (nil, nil, nil, nil, nil) when ClusterSecret is missing +// (push subsystem disabled — credentials can't be encrypted safely). +// Returns hard error only on store-init failure. +func buildPushDispatcher( + cfg *Config, + db rqlite.Client, + logger *logging.ColoredLogger, +) (*push.PushDispatcher, push.PushDeviceStore, *push.Manager, push.ConfigStore, error) { if cfg.ClusterSecret == "" { - // Devices are encrypted at rest using a cluster-secret-derived key. - // Without it we can't store anything safely. - return nil, nil, fmt.Errorf("push enabled but ClusterSecret is empty") + // Without the cluster secret we can't encrypt credentials at rest. + // Disable the whole push subsystem; HTTP routes return 503. + return nil, nil, nil, nil, nil } + store, err := push.NewRqliteDeviceStore(db, cfg.ClusterSecret, logger.Logger) if err != nil { - return nil, nil, fmt.Errorf("init push device store: %w", err) + return nil, nil, nil, nil, fmt.Errorf("init push device store: %w", err) } - d := push.New(store, logger.Logger) - if cfg.NtfyBaseURL != "" { - d.Register(pushntfy.New(pushntfy.Config{ - BaseURL: cfg.NtfyBaseURL, - AuthToken: cfg.NtfyAuthToken, - }, logger.Logger)) - logger.ComponentInfo(logging.ComponentGeneral, "push provider registered: ntfy", - zap.String("base_url", cfg.NtfyBaseURL)) + + cfgStore, err := push.NewRqliteConfigStore(db, cfg.ClusterSecret, logger.Logger) + if err != nil { + return nil, nil, nil, nil, fmt.Errorf("init push config store: %w", err) } - if cfg.ExpoAccessToken != "" { - d.Register(pushexpo.New(pushexpo.Config{ - AccessToken: cfg.ExpoAccessToken, - }, logger.Logger)) - logger.ComponentInfo(logging.ComponentGeneral, "push provider registered: expo") + + // ProviderFactory turns a resolved Config into the right set of + // provider instances. Lives here in dependencies.go because this is + // the only place that imports both the manager package and the + // concrete provider sub-packages — keeps push core dep-cycle-free. + factory := func(c push.Config) []push.PushProvider { + var ps []push.PushProvider + if c.NtfyBaseURL != "" { + ps = append(ps, pushntfy.New(pushntfy.Config{ + BaseURL: c.NtfyBaseURL, + AuthToken: c.NtfyAuthToken, + }, logger.Logger)) + } + if c.ExpoAccessToken != "" { + ps = append(ps, pushexpo.New(pushexpo.Config{ + AccessToken: c.ExpoAccessToken, + }, logger.Logger)) + } + return ps } - return d, store, nil + + defaults := push.Defaults{ + NtfyBaseURL: cfg.NtfyBaseURL, + NtfyAuthToken: cfg.NtfyAuthToken, + ExpoAccessToken: cfg.ExpoAccessToken, + } + manager := push.NewManager(store, cfgStore, defaults, factory, logger.Logger) + + // Legacy single-tier dispatcher kept ONLY when YAML defaults exist — + // some non-Manager code paths (notably the WASM push_send hostfunc + // before its migration to Manager) still expect a populated + // PushDispatcher. New code routes via Manager. + var legacy *push.PushDispatcher + if !defaults.IsEmpty() { + legacy = push.New(store, logger.Logger) + for _, p := range factory(push.Config{ + NtfyBaseURL: defaults.NtfyBaseURL, + NtfyAuthToken: defaults.NtfyAuthToken, + ExpoAccessToken: defaults.ExpoAccessToken, + }) { + legacy.Register(p) + } + } + + if defaults.NtfyBaseURL != "" { + logger.ComponentInfo(logging.ComponentGeneral, "push default provider: ntfy", + zap.String("base_url", defaults.NtfyBaseURL)) + } + if defaults.ExpoAccessToken != "" { + logger.ComponentInfo(logging.ComponentGeneral, "push default provider: expo configured") + } + logger.ComponentInfo(logging.ComponentGeneral, + "push subsystem initialized; tenants can self-serve via PUT /v1/push/config") + + return legacy, store, manager, cfgStore, nil } diff --git a/core/pkg/gateway/gateway.go b/core/pkg/gateway/gateway.go index 50995db..d132657 100644 --- a/core/pkg/gateway/gateway.go +++ b/core/pkg/gateway/gateway.go @@ -367,7 +367,19 @@ func New(logger *logging.ColoredLogger, cfg *Config) (*Gateway, error) { // The handlers themselves return 503 if dispatcher/store is nil; we // register them unconditionally so the routes always exist with a // predictable shape. - if deps.PushDispatcher != nil { + // + // Prefer the Manager-backed constructor (bug #220 follow-up) so + // tenants can self-serve their push config via PUT /v1/push/config. + // Fall back to the legacy constructor when only the YAML-derived + // dispatcher is available (older deployments without ClusterSecret). + if deps.PushManager != nil { + gw.pushHandlers = pushhandlers.NewHandlersWithManager( + deps.PushManager, + deps.PushConfigStore, + deps.PushDeviceStore, + logger, + ) + } else if deps.PushDispatcher != nil { gw.pushHandlers = pushhandlers.NewHandlers(deps.PushDispatcher, deps.PushDeviceStore, logger) } diff --git a/core/pkg/gateway/handlers/push/config_handler.go b/core/pkg/gateway/handlers/push/config_handler.go new file mode 100644 index 0000000..fc23461 --- /dev/null +++ b/core/pkg/gateway/handlers/push/config_handler.go @@ -0,0 +1,233 @@ +package push + +// config_handler.go — tenant-self-service push provider configuration. +// +// Endpoints (mounted under /v1/push/config; namespace-ownership middleware applies): +// +// GET /v1/push/config → current config (secrets redacted: only "has_X" booleans) +// PUT /v1/push/config → set/update fields; sensitive credentials encrypted at rest +// DELETE /v1/push/config → clear the namespace's row (push reverts to gateway YAML defaults) +// +// Bug #220 follow-up. Eliminates the "tenant must file an ops ticket" +// workflow: once this lands, AnChat (and every future tenant) self-serves +// their push provider config via authenticated HTTP, no operator +// involvement. + +import ( + "encoding/json" + "errors" + "net/http" + "strings" + "time" + + "github.com/DeBrosOfficial/network/pkg/push" + "go.uber.org/zap" +) + +// configManager is the subset of *push.Manager the config handlers need — +// kept narrow for testability. +type configManager interface { + IsConfigured(ctx contextLike, namespace string) bool + Invalidate(namespace string) +} + +// contextLike avoids importing context everywhere — the handler is +// already in package serverless which has request contexts. +type contextLike = interface { + Done() <-chan struct{} +} + +// PutConfigRequest is the body of PUT /v1/push/config. +// +// Field semantics: +// - Unset fields (zero value) leave the existing value alone. +// - Empty-string fields explicitly clear the value. +// - To clear the entire row use DELETE — that's clearer than empty PUT. +type PutConfigRequest struct { + NtfyBaseURL *string `json:"ntfy_base_url,omitempty"` + NtfyAuthToken *string `json:"ntfy_auth_token,omitempty"` + ExpoAccessToken *string `json:"expo_access_token,omitempty"` +} + +// MaxConfigBodyBytes caps the PUT body size. Push tokens are typically +// well under 1 KB but we leave headroom. +const MaxConfigBodyBytes = 16 * 1024 + +// pushConfigManager is the concrete dependency the Handlers struct holds +// — a *push.Manager. We extract it via a small interface for tests. +type pushConfigManager interface { + IsConfigured(ctx interface{ Done() <-chan struct{} }, namespace string) bool + Invalidate(namespace string) +} + +// GetConfigHandler — GET /v1/push/config. Returns the namespace's current +// push provider config with sensitive fields REDACTED to boolean flags. +// +// Always 200 + canonical envelope-free body when the request is well-formed +// (clients can rely on the shape: an absent provider just shows +// `has_ntfy_auth_token: false`). 503 only when the config store itself +// isn't available on this gateway (e.g. push subsystem disabled). +func (h *Handlers) GetConfigHandler(w http.ResponseWriter, r *http.Request) { + if h.configStore == nil { + writeError(w, http.StatusServiceUnavailable, + "push config store not available on this gateway") + return + } + if r.Method != http.MethodGet { + writeError(w, http.StatusMethodNotAllowed, "method not allowed") + return + } + ns := resolveNamespace(r) + if ns == "" { + writeError(w, http.StatusForbidden, "namespace not resolved") + return + } + + cfg, err := h.configStore.Get(boundCtx(r), ns) + if err != nil && !errors.Is(err, push.ErrConfigNotFound) { + h.logger.ComponentWarn("push", "config GET failed", + zap.String("namespace", ns), zap.Error(err)) + writeError(w, http.StatusInternalServerError, "failed to load config") + return + } + // Not found → return empty redacted config. Clients distinguish + // "configured" via the boolean fields. + if cfg == nil { + writeJSON(w, http.StatusOK, push.RedactedConfig{Namespace: ns}) + return + } + writeJSON(w, http.StatusOK, cfg.Redacted()) +} + +// PutConfigHandler — PUT /v1/push/config. Updates the namespace's push +// provider config. Field-level semantics: nil JSON values leave the +// existing field untouched; explicit empty-strings clear it. +// +// On success returns the redacted config (same shape as GET) so clients +// can confirm what's now in place without echoing back the credentials. +// +// Invalidates the manager's cached dispatcher for this namespace so the +// next push send rebuilds with the fresh config. +func (h *Handlers) PutConfigHandler(w http.ResponseWriter, r *http.Request) { + if h.configStore == nil { + writeError(w, http.StatusServiceUnavailable, + "push config store not available on this gateway") + return + } + if r.Method != http.MethodPut && r.Method != http.MethodPost { + writeError(w, http.StatusMethodNotAllowed, "method not allowed (use PUT)") + return + } + ns := resolveNamespace(r) + if ns == "" { + writeError(w, http.StatusForbidden, "namespace not resolved") + return + } + caller := resolveCallerUserID(r) + if caller == "" { + writeError(w, http.StatusUnauthorized, "user authentication required") + return + } + + r.Body = http.MaxBytesReader(w, r.Body, MaxConfigBodyBytes) + var body PutConfigRequest + if err := json.NewDecoder(r.Body).Decode(&body); err != nil { + writeError(w, http.StatusBadRequest, "invalid body: expected JSON") + return + } + + // Validate URL fields look reasonable. We don't do hostname resolution + // here (slow, flaky); just reject obviously-wrong schemes. + if body.NtfyBaseURL != nil && *body.NtfyBaseURL != "" { + if !strings.HasPrefix(*body.NtfyBaseURL, "http://") && + !strings.HasPrefix(*body.NtfyBaseURL, "https://") { + writeError(w, http.StatusBadRequest, + "ntfy_base_url must start with http:// or https://") + return + } + } + + // Read existing for merge — PUT semantics are field-level, not + // whole-document replace. + existing, err := h.configStore.Get(boundCtx(r), ns) + if err != nil && !errors.Is(err, push.ErrConfigNotFound) { + h.logger.ComponentWarn("push", "config GET-before-PUT failed", + zap.String("namespace", ns), zap.Error(err)) + writeError(w, http.StatusInternalServerError, "failed to load config") + return + } + cfg := push.Config{Namespace: ns, UpdatedAt: time.Now().Unix(), UpdatedBy: caller} + if existing != nil { + cfg.NtfyBaseURL = existing.NtfyBaseURL + cfg.NtfyAuthToken = existing.NtfyAuthToken + cfg.ExpoAccessToken = existing.ExpoAccessToken + } + if body.NtfyBaseURL != nil { + cfg.NtfyBaseURL = *body.NtfyBaseURL + } + if body.NtfyAuthToken != nil { + cfg.NtfyAuthToken = *body.NtfyAuthToken + } + if body.ExpoAccessToken != nil { + cfg.ExpoAccessToken = *body.ExpoAccessToken + } + + if err := h.configStore.Upsert(boundCtx(r), cfg); err != nil { + h.logger.ComponentWarn("push", "config PUT failed", + zap.String("namespace", ns), zap.Error(err)) + writeError(w, http.StatusInternalServerError, "failed to save config") + return + } + if h.manager != nil { + h.manager.Invalidate(ns) + } + + h.logger.ComponentInfo("push", "config updated", + zap.String("namespace", ns), + zap.String("updated_by", caller), + zap.Bool("has_ntfy_url", cfg.NtfyBaseURL != ""), + zap.Bool("has_ntfy_auth_token", cfg.NtfyAuthToken != ""), + zap.Bool("has_expo_access_token", cfg.ExpoAccessToken != ""), + ) + writeJSON(w, http.StatusOK, cfg.Redacted()) +} + +// DeleteConfigHandler — DELETE /v1/push/config. Clears the namespace's +// row entirely; push reverts to gateway YAML defaults (or 503 "not +// configured" if no defaults). +func (h *Handlers) DeleteConfigHandler(w http.ResponseWriter, r *http.Request) { + if h.configStore == nil { + writeError(w, http.StatusServiceUnavailable, + "push config store not available on this gateway") + return + } + if r.Method != http.MethodDelete { + writeError(w, http.StatusMethodNotAllowed, "method not allowed") + return + } + ns := resolveNamespace(r) + if ns == "" { + writeError(w, http.StatusForbidden, "namespace not resolved") + return + } + caller := resolveCallerUserID(r) + if caller == "" { + writeError(w, http.StatusUnauthorized, "user authentication required") + return + } + + if err := h.configStore.Delete(boundCtx(r), ns); err != nil { + h.logger.ComponentWarn("push", "config DELETE failed", + zap.String("namespace", ns), zap.Error(err)) + writeError(w, http.StatusInternalServerError, "failed to delete config") + return + } + if h.manager != nil { + h.manager.Invalidate(ns) + } + h.logger.ComponentInfo("push", "config cleared", + zap.String("namespace", ns), + zap.String("cleared_by", caller), + ) + writeJSON(w, http.StatusOK, map[string]string{"status": "cleared"}) +} diff --git a/core/pkg/gateway/handlers/push/handlers.go b/core/pkg/gateway/handlers/push/handlers.go index 36eff7b..5b1e35f 100644 --- a/core/pkg/gateway/handlers/push/handlers.go +++ b/core/pkg/gateway/handlers/push/handlers.go @@ -2,6 +2,7 @@ package push import ( "encoding/json" + "errors" "net/http" "strings" "time" @@ -221,7 +222,9 @@ func (h *Handlers) DeleteDeviceHandler(w http.ResponseWriter, r *http.Request) { // exposing this in production.** The WASM hostfunc bypasses this issue // because trigger registration already gates which functions exist. func (h *Handlers) SendHandler(w http.ResponseWriter, r *http.Request) { - if h.dispatcher == nil { + // Either the per-namespace manager (preferred) or the legacy single + // dispatcher must be present. Both nil = push not configured at all. + if h.manager == nil && h.dispatcher == nil { writeError(w, http.StatusServiceUnavailable, "push: dispatcher not configured") return } @@ -261,14 +264,27 @@ func (h *Handlers) SendHandler(w http.ResponseWriter, r *http.Request) { Sound: body.Sound, Data: body.Data, } - if err := h.dispatcher.SendToUser(boundCtx(r), ns, body.UserID, msg); err != nil { + // Prefer the per-namespace Manager when present so per-namespace + // config (set via PUT /v1/push/config) takes effect. Fall back to the + // legacy single dispatcher only when no Manager is wired. + var sendErr error + if h.manager != nil { + sendErr = h.manager.SendToUser(boundCtx(r), ns, body.UserID, msg) + if errors.Is(sendErr, push.ErrPushNotConfigured) { + writeError(w, http.StatusServiceUnavailable, sendErr.Error()) + return + } + } else { + sendErr = h.dispatcher.SendToUser(boundCtx(r), ns, body.UserID, msg) + } + if sendErr != nil { // Treat as non-fatal: some devices may have failed but others may // have succeeded. Surface as 502 to signal partial trouble; logs // have the per-device detail. h.logger.ComponentWarn("push", "send to user partially failed", zap.String("namespace", ns), zap.String("user_id", body.UserID), - zap.Error(err)) + zap.Error(sendErr)) writeError(w, http.StatusBadGateway, "one or more devices failed") return } diff --git a/core/pkg/gateway/handlers/push/types.go b/core/pkg/gateway/handlers/push/types.go index 8410409..da66337 100644 --- a/core/pkg/gateway/handlers/push/types.go +++ b/core/pkg/gateway/handlers/push/types.go @@ -26,14 +26,26 @@ import ( // Handlers serves the /v1/push/* HTTP endpoints. Construct via NewHandlers; // it's safe for concurrent use. +// +// dispatcher is the legacy single-tier dispatcher (kept for the device +// register/list/delete + send paths). manager is the per-namespace +// dispatcher built on top of ConfigStore (new in bug #220 follow-up); +// when both are present, send paths route through manager so per-namespace +// config wins. +// +// configStore + manager may be nil on gateways with push fully disabled — +// the corresponding endpoints return 503. type Handlers struct { - dispatcher *push.PushDispatcher - store push.PushDeviceStore - logger *logging.ColoredLogger + dispatcher *push.PushDispatcher + manager *push.Manager + store push.PushDeviceStore + configStore push.ConfigStore + logger *logging.ColoredLogger } -// NewHandlers constructs a Handlers. Either argument may be nil — in which -// case the corresponding endpoints return 503 Service Unavailable. +// NewHandlers constructs a Handlers with the legacy single-namespace +// dispatcher only. Use NewHandlersWithManager for per-namespace config +// support (bug #220 follow-up). func NewHandlers(dispatcher *push.PushDispatcher, store push.PushDeviceStore, logger *logging.ColoredLogger) *Handlers { return &Handlers{ dispatcher: dispatcher, @@ -42,6 +54,23 @@ func NewHandlers(dispatcher *push.PushDispatcher, store push.PushDeviceStore, lo } } +// NewHandlersWithManager constructs Handlers wired to a Manager + ConfigStore +// for tenant-self-service per-namespace configuration. Send paths use the +// manager when present so per-namespace ntfy/expo settings take effect. +func NewHandlersWithManager( + manager *push.Manager, + configStore push.ConfigStore, + deviceStore push.PushDeviceStore, + logger *logging.ColoredLogger, +) *Handlers { + return &Handlers{ + manager: manager, + configStore: configStore, + store: deviceStore, + logger: logger, + } +} + // RegisterDeviceRequest is the body of POST /v1/push/devices. // // `device_id` is an app-supplied stable identifier (e.g. the OS-assigned diff --git a/core/pkg/gateway/handlers/serverless/deploy_handler.go b/core/pkg/gateway/handlers/serverless/deploy_handler.go index df148a3..505e832 100644 --- a/core/pkg/gateway/handlers/serverless/deploy_handler.go +++ b/core/pkg/gateway/handlers/serverless/deploy_handler.go @@ -173,6 +173,43 @@ func (h *ServerlessHandlers) DeployFunction(w http.ResponseWriter, r *http.Reque } } + // Register Cron triggers from definition. Mirrors the PubSub branch above: + // stale rows (from a previous deploy whose manifest had different cron + // schedules) are cleared first, then the manifest's expressions are + // re-added. Without this, manifest-driven cron schedules silently never + // fired (feature #65 audit). + // + // We always run the RemoveByFunction even when CronExpressions is empty, + // otherwise editing a manifest from `cron_expressions: ["0 3 * * *"]` to + // `cron_expressions: []` would leave the old schedule in place. + if h.cronStore != nil && fn != nil { + if err := h.cronStore.RemoveByFunction(ctx, fn.ID); err != nil { + h.logger.Warn("Failed to clear stale cron triggers", + zap.String("function", def.Name), + zap.Error(err)) + } + // Dedupe identical expressions so a manifest accident + // (`cron_expressions: ["0 3 * * *", "0 3 * * *"]`) doesn't fire the + // function twice every tick. + seen := make(map[string]struct{}, len(def.CronExpressions)) + for _, expr := range def.CronExpressions { + if _, dup := seen[expr]; dup { + continue + } + seen[expr] = struct{}{} + if _, err := h.cronStore.Add(ctx, fn.ID, expr); err != nil { + // Bad expression in a manifest is a user error worth surfacing + // but not blocking the deploy — the function itself is fine, + // only the schedule is dropped. Logged at WARN so operators + // see it; the deploy response still reports success. + h.logger.Warn("Failed to register cron trigger from manifest", + zap.String("function", def.Name), + zap.String("cron_expression", expr), + zap.Error(err)) + } + } + } + writeJSON(w, http.StatusCreated, map[string]interface{}{ "message": "Function deployed successfully", "function": fn, diff --git a/core/pkg/gateway/instance_spawner.go b/core/pkg/gateway/instance_spawner.go index a3d56dd..618c29a 100644 --- a/core/pkg/gateway/instance_spawner.go +++ b/core/pkg/gateway/instance_spawner.go @@ -110,21 +110,29 @@ type GatewayYAMLWebRTC struct { // This must match the yamlCfg struct in cmd/gateway/config.go exactly // because the gateway uses strict YAML decoding that rejects unknown fields type GatewayYAMLConfig struct { - ListenAddr string `yaml:"listen_addr"` - ClientNamespace string `yaml:"client_namespace"` - RQLiteDSN string `yaml:"rqlite_dsn"` - GlobalRQLiteDSN string `yaml:"global_rqlite_dsn,omitempty"` - BootstrapPeers []string `yaml:"bootstrap_peers,omitempty"` - EnableHTTPS bool `yaml:"enable_https,omitempty"` - DomainName string `yaml:"domain_name,omitempty"` - TLSCacheDir string `yaml:"tls_cache_dir,omitempty"` - OlricServers []string `yaml:"olric_servers"` - OlricTimeout string `yaml:"olric_timeout,omitempty"` - IPFSClusterAPIURL string `yaml:"ipfs_cluster_api_url,omitempty"` - IPFSAPIURL string `yaml:"ipfs_api_url,omitempty"` - IPFSTimeout string `yaml:"ipfs_timeout,omitempty"` - IPFSReplicationFactor int `yaml:"ipfs_replication_factor,omitempty"` - WebRTC GatewayYAMLWebRTC `yaml:"webrtc,omitempty"` + ListenAddr string `yaml:"listen_addr"` + ClientNamespace string `yaml:"client_namespace"` + RQLiteDSN string `yaml:"rqlite_dsn"` + GlobalRQLiteDSN string `yaml:"global_rqlite_dsn,omitempty"` + BootstrapPeers []string `yaml:"bootstrap_peers,omitempty"` + EnableHTTPS bool `yaml:"enable_https,omitempty"` + DomainName string `yaml:"domain_name,omitempty"` + TLSCacheDir string `yaml:"tls_cache_dir,omitempty"` + OlricServers []string `yaml:"olric_servers"` + OlricTimeout string `yaml:"olric_timeout,omitempty"` + IPFSClusterAPIURL string `yaml:"ipfs_cluster_api_url,omitempty"` + IPFSAPIURL string `yaml:"ipfs_api_url,omitempty"` + IPFSTimeout string `yaml:"ipfs_timeout,omitempty"` + IPFSReplicationFactor int `yaml:"ipfs_replication_factor,omitempty"` + WebRTC GatewayYAMLWebRTC `yaml:"webrtc,omitempty"` + // ClusterSecretPath points to the host's cluster-secret file. Bug #215 + // follow-up: namespace gateways spawned by systemd previously had no + // way to access the cluster secret, so they fell back to per-node + // random JWT signing keys and JWTs were unverifiable cross-node within + // the namespace cluster. Setting this path lets the standalone gateway + // binary read the secret on startup and derive the canonical Ed25519 + // key shared with every other gateway in the cluster. + ClusterSecretPath string `yaml:"cluster_secret_path,omitempty"` } // NewInstanceSpawner creates a new Gateway instance spawner diff --git a/core/pkg/gateway/instance_spawner_test.go b/core/pkg/gateway/instance_spawner_test.go new file mode 100644 index 0000000..56b210a --- /dev/null +++ b/core/pkg/gateway/instance_spawner_test.go @@ -0,0 +1,87 @@ +package gateway + +import ( + "strings" + "testing" + + "gopkg.in/yaml.v3" +) + +// TestGatewayYAMLConfig_clusterSecretPathRoundTrip is the regression test for +// bug #215 reopen. The first fix derived JWT keys from cfg.ClusterSecret, but +// namespace gateways spawned via systemd had cfg.ClusterSecret == "" because +// the YAML schema lacked any field to carry it. This test guards the YAML tag +// so every namespace gateway YAML written by SystemdSpawner.SpawnGateway +// carries the path the standalone binary needs. +func TestGatewayYAMLConfig_clusterSecretPathRoundTrip(t *testing.T) { + cfg := GatewayYAMLConfig{ + ListenAddr: ":6001", + ClientNamespace: "anchat-test", + RQLiteDSN: "http://localhost:10000", + OlricServers: []string{"localhost:3320"}, + ClusterSecretPath: "/opt/orama/.orama/secrets/cluster-secret", + } + out, err := yaml.Marshal(cfg) + if err != nil { + t.Fatalf("marshal: %v", err) + } + if !strings.Contains(string(out), "cluster_secret_path: /opt/orama/.orama/secrets/cluster-secret") { + t.Fatalf("YAML output missing expected cluster_secret_path line:\n%s", out) + } + + // Round-trip into a struct shaped like cmd/gateway/config.go's yamlCfg + // (internal type, duplicated here intentionally so this test catches + // drift between the two declarations). + type webrtc struct { + Enabled bool `yaml:"enabled"` + SFUPort int `yaml:"sfu_port"` + TURNDomain string `yaml:"turn_domain"` + TURNSecret string `yaml:"turn_secret"` + } + type yamlCfgMirror struct { + ListenAddr string `yaml:"listen_addr"` + ClientNamespace string `yaml:"client_namespace"` + RQLiteDSN string `yaml:"rqlite_dsn"` + GlobalRQLiteDSN string `yaml:"global_rqlite_dsn"` + Peers []string `yaml:"bootstrap_peers"` + EnableHTTPS bool `yaml:"enable_https"` + DomainName string `yaml:"domain_name"` + TLSCacheDir string `yaml:"tls_cache_dir"` + OlricServers []string `yaml:"olric_servers"` + OlricTimeout string `yaml:"olric_timeout"` + IPFSClusterAPIURL string `yaml:"ipfs_cluster_api_url"` + IPFSAPIURL string `yaml:"ipfs_api_url"` + IPFSTimeout string `yaml:"ipfs_timeout"` + IPFSReplicationFactor int `yaml:"ipfs_replication_factor"` + WebRTC webrtc `yaml:"webrtc"` + ClusterSecretPath string `yaml:"cluster_secret_path"` + } + var parsed yamlCfgMirror + if err := yaml.Unmarshal(out, &parsed); err != nil { + t.Fatalf("unmarshal: %v", err) + } + if parsed.ClusterSecretPath != cfg.ClusterSecretPath { + t.Errorf("round-trip mismatch: got %q, want %q", parsed.ClusterSecretPath, cfg.ClusterSecretPath) + } +} + +// TestGatewayYAMLConfig_omitWhenEmpty: when the host has no cluster secret, +// the field is omitted from the YAML so legacy single-node test rigs don't +// see a stray "cluster_secret_path: " line that operators might mistake for +// a configuration directive. +func TestGatewayYAMLConfig_omitWhenEmpty(t *testing.T) { + cfg := GatewayYAMLConfig{ + ListenAddr: ":6001", + ClientNamespace: "ns", + RQLiteDSN: "http://localhost:10000", + OlricServers: []string{"localhost:3320"}, + // ClusterSecretPath intentionally empty. + } + out, err := yaml.Marshal(cfg) + if err != nil { + t.Fatalf("marshal: %v", err) + } + if strings.Contains(string(out), "cluster_secret_path") { + t.Errorf("empty ClusterSecretPath should be omitted from YAML; got:\n%s", out) + } +} diff --git a/core/pkg/gateway/internal_auth_test.go b/core/pkg/gateway/internal_auth_test.go new file mode 100644 index 0000000..8e12c2e --- /dev/null +++ b/core/pkg/gateway/internal_auth_test.go @@ -0,0 +1,194 @@ +package gateway + +import ( + "net/http" + "reflect" + "testing" + + "github.com/DeBrosOfficial/network/pkg/gateway/auth" +) + +// TestSetInternalAuthJWTHeaders_setsSubAndCustom verifies that the proxy +// helper writes the validated JWT subject and base64(JSON) custom claims to +// outbound headers (bug #215 wiring). +func TestSetInternalAuthJWTHeaders_setsSubAndCustom(t *testing.T) { + h := http.Header{} + claims := &auth.JWTClaims{ + Sub: "BNbN2RNQTsYrrywZCLnhV9j3hd38jwcRqfxBecZX7hDE", + Custom: map[string]string{ + "tier": "pro", + "subscription": "active", + }, + } + setInternalAuthJWTHeaders(h, claims) + + if got := h.Get(HeaderInternalAuthJWTSub); got != claims.Sub { + t.Errorf("Sub header = %q, want %q", got, claims.Sub) + } + if h.Get(HeaderInternalAuthJWTCustom) == "" { + t.Error("Custom header is empty, expected base64 JSON blob") + } +} + +// TestSetInternalAuthJWTHeaders_nilClaimsNoOp guards the API-key path: when +// no JWT was used to authenticate, no JWT headers should be set. +func TestSetInternalAuthJWTHeaders_nilClaimsNoOp(t *testing.T) { + h := http.Header{} + setInternalAuthJWTHeaders(h, nil) + if got := h.Get(HeaderInternalAuthJWTSub); got != "" { + t.Errorf("Sub header = %q, want empty (nil claims)", got) + } + if got := h.Get(HeaderInternalAuthJWTCustom); got != "" { + t.Errorf("Custom header = %q, want empty (nil claims)", got) + } +} + +// TestSetInternalAuthJWTHeaders_emptySubNotForwarded keeps malformed claims +// from polluting the outbound request. +func TestSetInternalAuthJWTHeaders_emptySubNotForwarded(t *testing.T) { + h := http.Header{} + setInternalAuthJWTHeaders(h, &auth.JWTClaims{Sub: " "}) + if got := h.Get(HeaderInternalAuthJWTSub); got != "" { + t.Errorf("Sub header = %q, want empty (whitespace-only sub)", got) + } +} + +// TestSetInternalAuthJWTHeaders_emptyCustomMapNotForwarded — empty maps are +// skipped to keep headers small and avoid sending `e30=` (base64 of "{}"). +func TestSetInternalAuthJWTHeaders_emptyCustomMapNotForwarded(t *testing.T) { + h := http.Header{} + setInternalAuthJWTHeaders(h, &auth.JWTClaims{Sub: "0xabc", Custom: map[string]string{}}) + if got := h.Get(HeaderInternalAuthJWTCustom); got != "" { + t.Errorf("Custom header = %q, want empty (empty custom map)", got) + } +} + +// TestClaimsFromInternalAuthHeaders_roundTrip is the end-to-end guarantee: +// what the main gateway sets via setInternalAuthJWTHeaders MUST be exactly +// what the namespace gateway recovers via claimsFromInternalAuthHeaders. +func TestClaimsFromInternalAuthHeaders_roundTrip(t *testing.T) { + original := &auth.JWTClaims{ + Sub: "BNbN2RNQTsYrrywZCLnhV9j3hd38jwcRqfxBecZX7hDE", + Custom: map[string]string{ + "tier": "pro", + "subscription": "active", + }, + } + h := http.Header{} + setInternalAuthJWTHeaders(h, original) + + got := claimsFromInternalAuthHeaders(h, "anchat-test") + if got == nil { + t.Fatal("recovered claims is nil") + } + if got.Sub != original.Sub { + t.Errorf("Sub = %q, want %q", got.Sub, original.Sub) + } + if got.Namespace != "anchat-test" { + t.Errorf("Namespace = %q, want %q", got.Namespace, "anchat-test") + } + if !reflect.DeepEqual(got.Custom, original.Custom) { + t.Errorf("Custom = %#v, want %#v", got.Custom, original.Custom) + } +} + +// TestClaimsFromInternalAuthHeaders_noSubReturnsNil — when no JWT was +// forwarded (caller used API key), recovery returns nil so the namespace +// gateway leaves ctxKeyJWT unset. +func TestClaimsFromInternalAuthHeaders_noSubReturnsNil(t *testing.T) { + h := http.Header{} + if got := claimsFromInternalAuthHeaders(h, "ns"); got != nil { + t.Errorf("expected nil claims when no Sub header present, got %#v", got) + } +} + +// TestClaimsFromInternalAuthHeaders_invalidCustomIgnored — corrupt custom +// blob must not block recovery of the subject. +func TestClaimsFromInternalAuthHeaders_invalidCustomIgnored(t *testing.T) { + h := http.Header{} + h.Set(HeaderInternalAuthJWTSub, "0xabc") + h.Set(HeaderInternalAuthJWTCustom, "not-valid-base64!!!") + + got := claimsFromInternalAuthHeaders(h, "ns") + if got == nil { + t.Fatal("recovered claims is nil despite valid Sub header") + } + if got.Sub != "0xabc" { + t.Errorf("Sub = %q, want %q", got.Sub, "0xabc") + } + if got.Custom != nil { + t.Errorf("Custom = %#v, want nil (invalid blob should be silently dropped)", got.Custom) + } +} + +// TestStripInboundInternalAuthHeaders_removesAllForgedHeaders is the regression +// test for the security audit's CRITICAL finding (bug #215 follow-up). Without +// this strip, an external attacker could forge X-Internal-Auth-JWT-Sub on a +// public path and impersonate any wallet on the namespace gateway, which gates +// on source IP only. +func TestStripInboundInternalAuthHeaders_removesAllForgedHeaders(t *testing.T) { + h := http.Header{} + h.Set(HeaderInternalAuthValidated, "true") + h.Set(HeaderInternalAuthNamespace, "victim-namespace") + h.Set(HeaderInternalAuthJWTSub, "0xVICTIM_WALLET") + h.Set(HeaderInternalAuthJWTCustom, "ZXZpbA==") // base64("evil") + // Unrelated headers MUST survive the strip. + h.Set("X-Forwarded-For", "1.2.3.4") + h.Set("Authorization", "Bearer some-token") + + stripInboundInternalAuthHeaders(h) + + for _, name := range []string{ + HeaderInternalAuthValidated, + HeaderInternalAuthNamespace, + HeaderInternalAuthJWTSub, + HeaderInternalAuthJWTCustom, + } { + if got := h.Get(name); got != "" { + t.Errorf("after strip, %s = %q, want empty", name, got) + } + } + if got := h.Get("X-Forwarded-For"); got != "1.2.3.4" { + t.Errorf("X-Forwarded-For lost: %q", got) + } + if got := h.Get("Authorization"); got != "Bearer some-token" { + t.Errorf("Authorization lost: %q", got) + } +} + +// TestStripThenSet_attackerSubReplaced exercises the proxy-hop sequence: +// strip-then-set. An attacker-controlled JWT-Sub header MUST be replaced by +// the gateway-validated value (or removed entirely when the caller used an +// API key and validatedClaims is nil). +func TestStripThenSet_attackerSubReplaced(t *testing.T) { + t.Run("validated_jwt_overwrites_attacker_sub", func(t *testing.T) { + h := http.Header{} + h.Set(HeaderInternalAuthJWTSub, "0xATTACKER") + + stripInboundInternalAuthHeaders(h) + // Mimic the proxy hop: gateway authenticated user as 0xLEGIT. + h.Set(HeaderInternalAuthValidated, "true") + h.Set(HeaderInternalAuthNamespace, "ns") + setInternalAuthJWTHeaders(h, &auth.JWTClaims{Sub: "0xLEGIT"}) + + if got := h.Get(HeaderInternalAuthJWTSub); got != "0xLEGIT" { + t.Errorf("Sub = %q, want %q (attacker value should be overwritten)", got, "0xLEGIT") + } + }) + + t.Run("api_key_path_drops_attacker_sub", func(t *testing.T) { + h := http.Header{} + h.Set(HeaderInternalAuthJWTSub, "0xATTACKER") + + stripInboundInternalAuthHeaders(h) + // Mimic the proxy hop: gateway authenticated user via API key + // (validatedClaims is nil). + h.Set(HeaderInternalAuthValidated, "true") + h.Set(HeaderInternalAuthNamespace, "ns") + setInternalAuthJWTHeaders(h, nil) + + if got := h.Get(HeaderInternalAuthJWTSub); got != "" { + t.Errorf("Sub = %q, want empty (attacker value must not survive API-key auth)", got) + } + }) +} diff --git a/core/pkg/gateway/middleware.go b/core/pkg/gateway/middleware.go index e5fc6b1..7332d9a 100644 --- a/core/pkg/gateway/middleware.go +++ b/core/pkg/gateway/middleware.go @@ -2,6 +2,8 @@ package gateway import ( "context" + "encoding/base64" + "encoding/json" "errors" "fmt" "hash/fnv" @@ -70,27 +72,114 @@ const ( HeaderInternalAuthNamespace = "X-Internal-Auth-Namespace" // HeaderInternalAuthValidated indicates the request was pre-authenticated by main gateway HeaderInternalAuthValidated = "X-Internal-Auth-Validated" + // HeaderInternalAuthJWTSub carries the validated JWT `sub` claim across the + // internal proxy hop. Bug #215: without this, namespace gateways received + // only the namespace and host functions saw empty `caller_jwt_subject`. + // Trusted only when X-Internal-Auth-Validated=true AND the source IP is + // internal (WireGuard mesh / loopback) — same trust model as the + // namespace header. + HeaderInternalAuthJWTSub = "X-Internal-Auth-JWT-Sub" + // HeaderInternalAuthJWTCustom carries the validated JWT custom-claims map + // as a base64(JSON) blob. Optional; absent means the original token had + // no custom claims. Same trust gate as HeaderInternalAuthJWTSub. + HeaderInternalAuthJWTCustom = "X-Internal-Auth-JWT-Custom" ) +// setInternalAuthJWTHeaders writes the validated JWT subject and custom claims +// onto an outbound proxy request so the destination gateway can hydrate +// ctxKeyJWT. Called only after auth has been validated AND we're proxying over +// the WireGuard mesh (the destination still gates trust by source IP). +// +// No-op when claims is nil (caller authenticated with API key, not JWT). +func setInternalAuthJWTHeaders(h http.Header, claims *auth.JWTClaims) { + if claims == nil { + return + } + if sub := strings.TrimSpace(claims.Sub); sub != "" { + h.Set(HeaderInternalAuthJWTSub, sub) + } + if len(claims.Custom) > 0 { + buf, err := json.Marshal(claims.Custom) + if err == nil { + h.Set(HeaderInternalAuthJWTCustom, base64.StdEncoding.EncodeToString(buf)) + } + } +} + +// stripInboundInternalAuthHeaders deletes the X-Internal-Auth-* headers from +// the supplied header set. MUST be called on every outbound request the main +// gateway proxies to a namespace gateway BEFORE we (conditionally) re-set +// them with the validated values. +// +// SECURITY (bug #215 follow-up — critical): without this, an external client +// could send `X-Internal-Auth-Validated: true` + `X-Internal-Auth-JWT-Sub: +// ` directly. The header-copy loop at the proxy hop would +// forward those headers verbatim. The namespace gateway, seeing the request +// arrive from the main gateway's WireGuard IP, would trust them and hydrate +// ctxKeyJWT with attacker-controlled subject — full cross-namespace identity +// spoofing on every public path. +// +// Stripping FIRST and conditionally setting AFTER closes that hole regardless +// of which auth path won (JWT, API key, or no creds on a public path). +func stripInboundInternalAuthHeaders(h http.Header) { + h.Del(HeaderInternalAuthValidated) + h.Del(HeaderInternalAuthNamespace) + h.Del(HeaderInternalAuthJWTSub) + h.Del(HeaderInternalAuthJWTCustom) +} + +// claimsFromInternalAuthHeaders rebuilds a *auth.JWTClaims from the trusted +// internal-auth headers. Returns nil if no JWT subject was forwarded (the +// caller used an API key, or the request didn't carry validated JWT data). +// +// SECURITY: this MUST only be called after the caller has confirmed +// HeaderInternalAuthValidated == "true" AND the source IP is internal AND +// the proxy hop has stripped any inbound copies of these headers (see +// stripInboundInternalAuthHeaders). Skipping any of those would let any +// client forge JWT identities. +func claimsFromInternalAuthHeaders(h http.Header, namespace string) *auth.JWTClaims { + sub := strings.TrimSpace(h.Get(HeaderInternalAuthJWTSub)) + if sub == "" { + return nil + } + claims := &auth.JWTClaims{ + Sub: sub, + Namespace: namespace, + } + if raw := strings.TrimSpace(h.Get(HeaderInternalAuthJWTCustom)); raw != "" { + if decoded, err := base64.StdEncoding.DecodeString(raw); err == nil { + var custom map[string]string + if json.Unmarshal(decoded, &custom) == nil && len(custom) > 0 { + claims.Custom = custom + } + } + } + return claims +} + // validateAuthForNamespaceProxy validates the request's auth credentials against the MAIN -// cluster RQLite and returns the namespace the credentials belong to. +// cluster RQLite and returns the namespace the credentials belong to plus the +// JWT claims when the caller authenticated with a Bearer JWT. // This is used by handleNamespaceGatewayRequest to pre-authenticate before proxying to // namespace gateways (which have isolated RQLites without API keys). // // Returns: -// - (namespace, "") if auth is valid -// - ("", errorMessage) if auth is invalid -// - ("", "") if no auth credentials provided (for public paths) -func (g *Gateway) validateAuthForNamespaceProxy(r *http.Request) (namespace string, errMsg string) { +// - (namespace, claims, "") if auth is valid via JWT — claims may be used to +// populate internal-auth headers so the namespace gateway can hydrate +// `caller_jwt_subject` for serverless host functions (bug #215). +// - (namespace, nil, "") if auth is valid via API key. +// - ("", nil, errorMessage) if auth is invalid. +// - ("", nil, "") if no auth credentials provided (for public paths). +func (g *Gateway) validateAuthForNamespaceProxy(r *http.Request) (namespace string, claims *auth.JWTClaims, errMsg string) { // 1) Try JWT Bearer first - if auth := r.Header.Get("Authorization"); auth != "" { - lower := strings.ToLower(auth) + if authHeader := r.Header.Get("Authorization"); authHeader != "" { + lower := strings.ToLower(authHeader) if strings.HasPrefix(lower, "bearer ") { - tok := strings.TrimSpace(auth[len("Bearer "):]) + tok := strings.TrimSpace(authHeader[len("Bearer "):]) if strings.Count(tok, ".") == 2 { - if claims, err := g.authService.ParseAndVerifyJWT(tok); err == nil { - if ns := strings.TrimSpace(claims.Namespace); ns != "" { - return ns, "" + if c, err := g.authService.ParseAndVerifyJWT(tok); err == nil { + if ns := strings.TrimSpace(c.Namespace); ns != "" { + return ns, c, "" } } // JWT verification failed - fall through to API key check @@ -101,14 +190,14 @@ func (g *Gateway) validateAuthForNamespaceProxy(r *http.Request) (namespace stri // 2) Try API key key := extractAPIKey(r) if key == "" { - return "", "" // No credentials provided + return "", nil, "" // No credentials provided } ns, err := g.lookupAPIKeyNamespace(r.Context(), key, g.client) if err != nil { - return "", "invalid API key" + return "", nil, "invalid API key" } - return ns, "" + return ns, nil, "" } // lookupAPIKeyNamespace resolves an API key to its namespace using cache and DB. @@ -325,6 +414,17 @@ func (g *Gateway) authMiddleware(next http.Handler) http.Handler { if ns != "" { // Pre-authenticated by main gateway - trust the namespace reqCtx := context.WithValue(r.Context(), CtxKeyNamespaceOverride, ns) + // Bug #215: also rebuild ctxKeyJWT from the trusted + // internal-auth headers (subject + custom claims) so + // serverless host functions see a non-empty + // caller_jwt_subject. We deliberately do NOT re-verify + // the JWT here — namespace gateways may not share the + // signing key with the main gateway, and the main gateway + // already verified before forwarding. Trust gate is the + // source-IP check above. + if claims := claimsFromInternalAuthHeaders(r.Header, ns); claims != nil { + reqCtx = context.WithValue(reqCtx, ctxKeyJWT, claims) + } next.ServeHTTP(w, r.WithContext(reqCtx)) return } @@ -916,7 +1016,7 @@ func (g *Gateway) domainRoutingMiddleware(next http.Handler) http.Handler { func (g *Gateway) handleNamespaceGatewayRequest(w http.ResponseWriter, r *http.Request, namespaceName string) { // Validate auth against main cluster RQLite BEFORE proxying // This ensures API keys work even though they're not in the namespace's RQLite - validatedNamespace, authErr := g.validateAuthForNamespaceProxy(r) + validatedNamespace, validatedClaims, authErr := g.validateAuthForNamespaceProxy(r) if authErr != "" && !isPublicPath(r.URL.Path) { w.Header().Set("WWW-Authenticate", "Bearer error=\"invalid_token\"") writeError(w, http.StatusUnauthorized, authErr) @@ -1078,10 +1178,19 @@ func (g *Gateway) handleNamespaceGatewayRequest(w http.ResponseWriter, r *http.R r.Header.Set("X-Forwarded-For", getClientIP(r)) r.Header.Set("X-Forwarded-Proto", "https") r.Header.Set("X-Forwarded-Host", r.Host) + // SECURITY (bug #215 follow-up): drop any X-Internal-Auth-* headers + // the external client may have set BEFORE applying the trusted + // values from this gateway. Prevents identity spoofing on the + // namespace gateway, which gates on source IP only. + stripInboundInternalAuthHeaders(r.Header) // Set internal auth headers if auth was validated if validatedNamespace != "" { r.Header.Set(HeaderInternalAuthValidated, "true") r.Header.Set(HeaderInternalAuthNamespace, validatedNamespace) + // Bug #215: forward validated JWT subject + custom claims so the + // namespace gateway's auth middleware can hydrate ctxKeyJWT and + // host functions see a non-empty caller_jwt_subject. + setInternalAuthJWTHeaders(r.Header, validatedClaims) } r.URL.Scheme = "http" r.URL.Host = targetHost @@ -1120,11 +1229,21 @@ func (g *Gateway) handleNamespaceGatewayRequest(w http.ResponseWriter, r *http.R proxyReq.Header.Set("X-Forwarded-Host", r.Host) proxyReq.Header.Set("X-Original-Host", r.Host) + // SECURITY (bug #215 follow-up): drop any X-Internal-Auth-* headers + // the header-copy loop above may have inherited from the inbound + // request BEFORE writing the trusted values. The namespace gateway + // gates on source IP only; without this strip, an external attacker + // could forge X-Internal-Auth-JWT-Sub and impersonate any wallet. + stripInboundInternalAuthHeaders(proxyReq.Header) // Set internal auth headers if auth was validated by main gateway // This allows the namespace gateway to trust the authentication if validatedNamespace != "" { proxyReq.Header.Set(HeaderInternalAuthValidated, "true") proxyReq.Header.Set(HeaderInternalAuthNamespace, validatedNamespace) + // Bug #215: forward validated JWT subject + custom claims so the + // namespace gateway's auth middleware can hydrate ctxKeyJWT and + // host functions see a non-empty caller_jwt_subject. + setInternalAuthJWTHeaders(proxyReq.Header, validatedClaims) } // Pick the proxy timeout based on the path's expected work bound. @@ -1353,6 +1472,11 @@ serveLocal: targetHost := "localhost:" + strconv.Itoa(deployment.Port) target := "http://" + targetHost + // SECURITY (bug #215 follow-up): drop X-Internal-Auth-* headers before + // forwarding to a customer-deployed app on localhost. The app has no + // reason to see Orama's internal-auth metadata, and stripping prevents + // any accidental leakage if the customer ever adds header-based trust. + stripInboundInternalAuthHeaders(r.Header) // Set proxy headers r.Header.Set("X-Forwarded-For", getClientIP(r)) r.Header.Set("X-Forwarded-Proto", "https") @@ -1465,6 +1589,12 @@ func (g *Gateway) proxyCrossNode(w http.ResponseWriter, r *http.Request, deploym // Handle WebSocket upgrade requests specially if isWebSocketUpgrade(r) { + // SECURITY (bug #215 follow-up): drop X-Internal-Auth-* headers from + // the inbound request before forwarding to another node. The target + // node's authMiddleware gates internal-auth trust on source IP only; + // without this strip, an external attacker could forge JWT-Sub here + // and have the home node trust it after the cross-node hop. + stripInboundInternalAuthHeaders(r.Header) r.Header.Set("X-Forwarded-For", getClientIP(r)) r.Header.Set("X-Orama-Proxy-Node", g.nodePeerID) r.URL.Scheme = "http" @@ -1490,6 +1620,9 @@ func (g *Gateway) proxyCrossNode(w http.ResponseWriter, r *http.Request, deploym proxyReq.Header.Add(key, value) } } + // SECURITY (bug #215 follow-up): drop any X-Internal-Auth-* headers the + // inbound request may have carried. See WS branch above. + stripInboundInternalAuthHeaders(proxyReq.Header) proxyReq.Host = r.Host // Keep original host for domain routing on target node proxyReq.Header.Set("X-Forwarded-For", getClientIP(r)) proxyReq.Header.Set("X-Orama-Proxy-Node", g.nodePeerID) // Prevent loops @@ -1590,6 +1723,8 @@ func (g *Gateway) proxyCrossNodeToIP(w http.ResponseWriter, r *http.Request, dep // Handle WebSocket upgrade requests specially if isWebSocketUpgrade(r) { + // SECURITY (bug #215 follow-up): see proxyCrossNode for rationale. + stripInboundInternalAuthHeaders(r.Header) r.Header.Set("X-Forwarded-For", getClientIP(r)) r.Header.Set("X-Orama-Proxy-Node", g.nodePeerID) r.URL.Scheme = "http" @@ -1613,6 +1748,8 @@ func (g *Gateway) proxyCrossNodeToIP(w http.ResponseWriter, r *http.Request, dep proxyReq.Header.Add(key, value) } } + // SECURITY (bug #215 follow-up): see proxyCrossNode for rationale. + stripInboundInternalAuthHeaders(proxyReq.Header) proxyReq.Host = r.Host proxyReq.Header.Set("X-Forwarded-For", getClientIP(r)) proxyReq.Header.Set("X-Orama-Proxy-Node", g.nodePeerID) diff --git a/core/pkg/gateway/push_routes.go b/core/pkg/gateway/push_routes.go index f45cc17..f0a81b6 100644 --- a/core/pkg/gateway/push_routes.go +++ b/core/pkg/gateway/push_routes.go @@ -62,3 +62,27 @@ func (g *Gateway) pushSendHandler(w http.ResponseWriter, r *http.Request) { } g.pushHandlers.SendHandler(w, r) } + +// pushConfigHandler dispatches GET / PUT / DELETE on /v1/push/config — the +// tenant-self-service entrypoint for per-namespace push provider config +// (bug #220 follow-up). When push is fully disabled returns 503 with the +// same actionable message as the device endpoints. +func (g *Gateway) pushConfigHandler(w http.ResponseWriter, r *http.Request) { + if g.pushHandlers == nil { + httputil.WriteRPCError(w, http.StatusServiceUnavailable, + httputil.ErrCodeServiceUnavailable, pushNotConfiguredMessage) + return + } + switch r.Method { + case http.MethodGet: + g.pushHandlers.GetConfigHandler(w, r) + case http.MethodPut, http.MethodPost: + g.pushHandlers.PutConfigHandler(w, r) + case http.MethodDelete: + g.pushHandlers.DeleteConfigHandler(w, r) + default: + httputil.WriteRPCError(w, http.StatusMethodNotAllowed, + httputil.ErrCodeValidationFailed, + "method not allowed: use GET to read, PUT to update, or DELETE to clear") + } +} diff --git a/core/pkg/gateway/routes.go b/core/pkg/gateway/routes.go index 1b34244..2fa2687 100644 --- a/core/pkg/gateway/routes.go +++ b/core/pkg/gateway/routes.go @@ -139,6 +139,11 @@ func (g *Gateway) Routes() http.Handler { mux.HandleFunc("/v1/push/devices/", g.pushDevicesByIDHandler) mux.HandleFunc("/v1/push/send", g.pushSendHandler) + // Per-namespace push provider configuration (bug #220 follow-up): + // GET / PUT / DELETE — tenants self-serve their ntfy/expo credentials + // instead of filing an ops ticket. Method dispatched in the handler. + mux.HandleFunc("/v1/push/config", g.pushConfigHandler) + // operator node management (wallet JWT auth via middleware) if g.operatorHandler != nil { mux.HandleFunc("/v1/operator/invite", g.operatorHandler.HandleInvite) diff --git a/core/pkg/gateway/signing_key.go b/core/pkg/gateway/signing_key.go index 30e8ba2..64ce2d0 100644 --- a/core/pkg/gateway/signing_key.go +++ b/core/pkg/gateway/signing_key.go @@ -4,19 +4,27 @@ import ( "crypto/ed25519" "crypto/rand" "crypto/rsa" + "crypto/sha256" "crypto/x509" "encoding/pem" "fmt" + "io" "os" "path/filepath" "github.com/DeBrosOfficial/network/pkg/logging" "go.uber.org/zap" + "golang.org/x/crypto/hkdf" ) const jwtKeyFileName = "jwt-signing-key.pem" const eddsaKeyFileName = "jwt-eddsa-key.pem" +// jwtEdDSADerivePurpose is the HKDF info string used to derive the cluster-wide +// Ed25519 JWT signing seed from the cluster secret. Bumping the version label +// (e.g. "...-v2") on disk = full re-derive + invalidates old tokens. +const jwtEdDSADerivePurpose = "orama-jwt-eddsa-v1" + // loadOrCreateSigningKey loads the JWT signing key from disk, or generates a new one // if none exists. This ensures JWTs survive gateway restarts. func loadOrCreateSigningKey(dataDir string, logger *logging.ColoredLogger) ([]byte, error) { @@ -65,10 +73,37 @@ func loadOrCreateSigningKey(dataDir string, logger *logging.ColoredLogger) ([]by } // loadOrCreateEdSigningKey loads or generates an Ed25519 private key for EdDSA JWT signing. -// The key is stored as a PKCS8-encoded PEM file alongside the RSA key. -func loadOrCreateEdSigningKey(dataDir string, logger *logging.ColoredLogger) (ed25519.PrivateKey, error) { +// +// Bug #215 fix: when a non-empty clusterSecret is provided, the key is derived +// DETERMINISTICALLY from the cluster secret using HKDF-SHA256. This means every +// gateway in the cluster ends up with the same Ed25519 keypair, so a JWT signed +// by one gateway can be verified by any other gateway. Without this, each +// gateway minted its own random key on first boot, JWTs were unverifiable +// cross-gateway, and host functions saw empty `caller_jwt_subject` whenever +// invoke landed on a different node than `/v1/auth/login`. +// +// When clusterSecret is empty (single-node test rigs, no shared secret), +// falls back to the legacy random-per-node behaviour. +// +// On-disk PEM is the source of truth at runtime; if it doesn't match the +// derivation (e.g. left over from before this fix, or cluster secret rotated +// the label), the file is rewritten with the canonical key. Old tokens become +// unverifiable but JWT lifetime is short (15 min) so the disruption is bounded. +func loadOrCreateEdSigningKey(dataDir, clusterSecret string, logger *logging.ColoredLogger) (ed25519.PrivateKey, error) { keyPath := filepath.Join(dataDir, "secrets", eddsaKeyFileName) + // Compute the canonical key for this cluster (if a secret is available). + // Empty clusterSecret = legacy mode, no canonical key, just use whatever + // is on disk or freshly generated. + var canonical ed25519.PrivateKey + if clusterSecret != "" { + seed, err := deriveEd25519Seed(clusterSecret) + if err != nil { + return nil, fmt.Errorf("derive Ed25519 seed from cluster secret: %w", err) + } + canonical = ed25519.NewKeyFromSeed(seed) + } + // Try to load existing key if keyPEM, err := os.ReadFile(keyPath); err == nil && len(keyPEM) > 0 { block, _ := pem.Decode(keyPEM) @@ -76,20 +111,38 @@ func loadOrCreateEdSigningKey(dataDir string, logger *logging.ColoredLogger) (ed parsed, err := x509.ParsePKCS8PrivateKey(block.Bytes) if err == nil { if edKey, ok := parsed.(ed25519.PrivateKey); ok { - logger.ComponentInfo(logging.ComponentGeneral, "Loaded existing EdDSA signing key", - zap.String("path", keyPath)) - return edKey, nil + // If we have a canonical cluster-derived key, the on-disk + // key MUST match it. Otherwise we'd silently keep using a + // per-node random key and JWTs wouldn't verify cross-node. + if canonical != nil && !ed25519.PrivateKey(edKey).Equal(canonical) { + logger.ComponentWarn(logging.ComponentGeneral, + "On-disk EdDSA key does not match cluster-derived key; rewriting from cluster secret", + zap.String("path", keyPath)) + // Fall through to write canonical below. + } else { + logger.ComponentInfo(logging.ComponentGeneral, "Loaded existing EdDSA signing key", + zap.String("path", keyPath)) + return edKey, nil + } } } } - logger.ComponentWarn(logging.ComponentGeneral, "Existing EdDSA signing key is invalid, generating new one", - zap.String("path", keyPath)) + if canonical == nil { + logger.ComponentWarn(logging.ComponentGeneral, "Existing EdDSA signing key is invalid, generating new one", + zap.String("path", keyPath)) + } } - // Generate new Ed25519 key - _, priv, err := ed25519.GenerateKey(rand.Reader) - if err != nil { - return nil, fmt.Errorf("generate Ed25519 key: %w", err) + // Either nothing on disk, key was unparseable, or it didn't match the + // canonical cluster-derived key. Use canonical when available; otherwise + // generate a random one (legacy single-node fallback). + priv := canonical + if priv == nil { + _, generated, err := ed25519.GenerateKey(rand.Reader) + if err != nil { + return nil, fmt.Errorf("generate Ed25519 key: %w", err) + } + priv = generated } pkcs8Bytes, err := x509.MarshalPKCS8PrivateKey(priv) @@ -112,7 +165,27 @@ func loadOrCreateEdSigningKey(dataDir string, logger *logging.ColoredLogger) (ed return nil, fmt.Errorf("write EdDSA signing key: %w", err) } - logger.ComponentInfo(logging.ComponentGeneral, "Generated and saved new EdDSA signing key", - zap.String("path", keyPath)) + source := "random per-node" + if canonical != nil { + source = "cluster-derived (HKDF)" + } + logger.ComponentInfo(logging.ComponentGeneral, "Saved EdDSA signing key", + zap.String("path", keyPath), + zap.String("source", source)) return priv, nil } + +// deriveEd25519Seed derives a deterministic 32-byte seed for Ed25519 from the +// cluster secret using HKDF-SHA256 with a stable purpose label. Same secret + +// same label = same seed = same keypair on every gateway in the cluster. +func deriveEd25519Seed(clusterSecret string) ([]byte, error) { + if clusterSecret == "" { + return nil, fmt.Errorf("cluster secret is empty") + } + reader := hkdf.New(sha256.New, []byte(clusterSecret), nil, []byte(jwtEdDSADerivePurpose)) + seed := make([]byte, ed25519.SeedSize) + if _, err := io.ReadFull(reader, seed); err != nil { + return nil, fmt.Errorf("HKDF read failed: %w", err) + } + return seed, nil +} diff --git a/core/pkg/gateway/signing_key_test.go b/core/pkg/gateway/signing_key_test.go new file mode 100644 index 0000000..0950afe --- /dev/null +++ b/core/pkg/gateway/signing_key_test.go @@ -0,0 +1,147 @@ +package gateway + +import ( + "crypto/ed25519" + "os" + "path/filepath" + "testing" + + "github.com/DeBrosOfficial/network/pkg/logging" +) + +// TestDeriveEd25519Seed_deterministic guards bug #215: every gateway in a +// cluster must end up with the same Ed25519 keypair so JWTs verify on any +// node. Same cluster secret + same purpose label MUST produce the same seed. +func TestDeriveEd25519Seed_deterministic(t *testing.T) { + a, err := deriveEd25519Seed("super-secret-cluster-key") + if err != nil { + t.Fatalf("derive #1: %v", err) + } + b, err := deriveEd25519Seed("super-secret-cluster-key") + if err != nil { + t.Fatalf("derive #2: %v", err) + } + if len(a) != ed25519.SeedSize { + t.Errorf("seed size = %d, want %d", len(a), ed25519.SeedSize) + } + if string(a) != string(b) { + t.Error("seed not deterministic for same secret") + } +} + +// TestDeriveEd25519Seed_differentSecretsDifferentSeeds rules out a trivial +// implementation that ignores the input. +func TestDeriveEd25519Seed_differentSecretsDifferentSeeds(t *testing.T) { + a, err := deriveEd25519Seed("secret-a") + if err != nil { + t.Fatalf("derive a: %v", err) + } + b, err := deriveEd25519Seed("secret-b") + if err != nil { + t.Fatalf("derive b: %v", err) + } + if string(a) == string(b) { + t.Error("different secrets produced identical seed") + } +} + +func TestDeriveEd25519Seed_emptySecret(t *testing.T) { + if _, err := deriveEd25519Seed(""); err == nil { + t.Error("expected error for empty cluster secret, got nil") + } +} + +// TestLoadOrCreateEdSigningKey_clusterSecretSharedAcrossNodes simulates two +// gateways with separate dataDirs but the same cluster secret. They MUST end +// up with the same Ed25519 private key so JWTs verify cross-node. +func TestLoadOrCreateEdSigningKey_clusterSecretSharedAcrossNodes(t *testing.T) { + dirA := t.TempDir() + dirB := t.TempDir() + const clusterSecret = "shared-cluster-secret-for-test" + + logger, _ := logging.NewColoredLogger(logging.ComponentGeneral, false) + + keyA, err := loadOrCreateEdSigningKey(dirA, clusterSecret, logger) + if err != nil { + t.Fatalf("node A: %v", err) + } + keyB, err := loadOrCreateEdSigningKey(dirB, clusterSecret, logger) + if err != nil { + t.Fatalf("node B: %v", err) + } + if !ed25519.PrivateKey(keyA).Equal(keyB) { + t.Fatal("two nodes with same cluster secret produced different Ed25519 keys") + } + // PEMs should also be persisted. + if _, err := os.Stat(filepath.Join(dirA, "secrets", eddsaKeyFileName)); err != nil { + t.Errorf("PEM not written on node A: %v", err) + } + if _, err := os.Stat(filepath.Join(dirB, "secrets", eddsaKeyFileName)); err != nil { + t.Errorf("PEM not written on node B: %v", err) + } +} + +// TestLoadOrCreateEdSigningKey_emptySecretFallback verifies the legacy +// per-node behaviour is preserved when no cluster secret is available +// (single-node test rigs, dev). Two nodes get DIFFERENT random keys. +func TestLoadOrCreateEdSigningKey_emptySecretFallback(t *testing.T) { + dirA := t.TempDir() + dirB := t.TempDir() + logger, _ := logging.NewColoredLogger(logging.ComponentGeneral, false) + + keyA, err := loadOrCreateEdSigningKey(dirA, "", logger) + if err != nil { + t.Fatalf("node A: %v", err) + } + keyB, err := loadOrCreateEdSigningKey(dirB, "", logger) + if err != nil { + t.Fatalf("node B: %v", err) + } + if ed25519.PrivateKey(keyA).Equal(keyB) { + t.Error("two nodes without cluster secret unexpectedly produced identical keys") + } +} + +// TestLoadOrCreateEdSigningKey_overwritesStaleOnDiskKey covers the upgrade +// path: a gateway that previously generated a per-node random key (fix #215 +// not yet deployed) now restarts with cluster-secret derivation enabled. The +// random key on disk MUST be replaced with the canonical cluster-derived one. +func TestLoadOrCreateEdSigningKey_overwritesStaleOnDiskKey(t *testing.T) { + dir := t.TempDir() + logger, _ := logging.NewColoredLogger(logging.ComponentGeneral, false) + + // First boot: no cluster secret -> per-node random key. + keyV1, err := loadOrCreateEdSigningKey(dir, "", logger) + if err != nil { + t.Fatalf("v1: %v", err) + } + + // Second boot: cluster secret now configured -> must rewrite to canonical. + const clusterSecret = "now-i-have-a-secret" + keyV2, err := loadOrCreateEdSigningKey(dir, clusterSecret, logger) + if err != nil { + t.Fatalf("v2: %v", err) + } + if ed25519.PrivateKey(keyV1).Equal(keyV2) { + t.Fatal("stale per-node key was not replaced when cluster secret became available") + } + + // And the new key must match a fresh derivation from the same secret. + seed, err := deriveEd25519Seed(clusterSecret) + if err != nil { + t.Fatalf("derive seed: %v", err) + } + canonical := ed25519.NewKeyFromSeed(seed) + if !ed25519.PrivateKey(keyV2).Equal(canonical) { + t.Error("rewritten key does not match canonical derivation") + } + + // Third boot with same secret: must be stable, no rewrite, same key. + keyV3, err := loadOrCreateEdSigningKey(dir, clusterSecret, logger) + if err != nil { + t.Fatalf("v3: %v", err) + } + if !ed25519.PrivateKey(keyV2).Equal(keyV3) { + t.Error("canonical key is not stable across restarts") + } +} diff --git a/core/pkg/namespace/cluster_manager.go b/core/pkg/namespace/cluster_manager.go index 71c3446..1bb08a9 100644 --- a/core/pkg/namespace/cluster_manager.go +++ b/core/pkg/namespace/cluster_manager.go @@ -39,6 +39,12 @@ type ClusterManagerConfig struct { // in RQLite. Derived from cluster secret via HKDF(clusterSecret, "turn-encryption"). // If nil, TURN secrets are stored in plaintext (backward compatibility). TurnEncryptionKey []byte + + // ClusterSecretPath is the host's cluster-secret file path. Forwarded + // to spawned namespace gateways via YAML so they can derive the + // cluster-wide JWT signing key (bug #215 fix). Empty string disables + // cross-node JWT verification within namespace clusters. + ClusterSecretPath string } // ClusterManager orchestrates namespace cluster provisioning and lifecycle @@ -81,7 +87,7 @@ func NewClusterManager( portAllocator := NewNamespacePortAllocator(db, logger) webrtcPortAllocator := NewWebRTCPortAllocator(db, logger) nodeSelector := NewClusterNodeSelector(db, portAllocator, logger) - systemdSpawner := NewSystemdSpawner(cfg.BaseDataDir, logger) + systemdSpawner := NewSystemdSpawner(cfg.BaseDataDir, cfg.ClusterSecretPath, logger) dnsManager := NewDNSRecordManager(db, cfg.BaseDomain, logger) // Set IPFS defaults diff --git a/core/pkg/namespace/systemd_spawner.go b/core/pkg/namespace/systemd_spawner.go index 4e83cc0..c25b0ad 100644 --- a/core/pkg/namespace/systemd_spawner.go +++ b/core/pkg/namespace/systemd_spawner.go @@ -22,15 +22,28 @@ import ( type SystemdSpawner struct { systemdMgr *systemd.Manager namespaceBase string - logger *zap.Logger + // clusterSecretPath is the host's cluster-secret file path; written + // into spawned namespace gateways' YAML so they can derive the + // cluster-wide JWT signing key (bug #215). Empty string means the host + // has no cluster secret available — namespace gateways will fall back + // to per-node random keys and JWTs won't verify cross-node. + clusterSecretPath string + logger *zap.Logger } -// NewSystemdSpawner creates a new systemd-based spawner -func NewSystemdSpawner(namespaceBase string, logger *zap.Logger) *SystemdSpawner { +// NewSystemdSpawner creates a new systemd-based spawner. +// +// clusterSecretPath should point to the host node's cluster-secret file +// (typically `/secrets/cluster-secret`). It is written into each +// spawned namespace gateway's YAML config so the gateway can read it on +// startup. Pass "" only if no cluster secret exists on this host (legacy +// single-node test deployments). +func NewSystemdSpawner(namespaceBase, clusterSecretPath string, logger *zap.Logger) *SystemdSpawner { return &SystemdSpawner{ - systemdMgr: systemd.NewManager(namespaceBase, logger), - namespaceBase: namespaceBase, - logger: logger.With(zap.String("component", "systemd-spawner")), + systemdMgr: systemd.NewManager(namespaceBase, logger), + namespaceBase: namespaceBase, + clusterSecretPath: clusterSecretPath, + logger: logger.With(zap.String("component", "systemd-spawner")), } } @@ -209,6 +222,12 @@ func (s *SystemdSpawner) SpawnGateway(ctx context.Context, namespace, nodeID str IPFSAPIURL: cfg.IPFSAPIURL, IPFSTimeout: cfg.IPFSTimeout.String(), IPFSReplicationFactor: cfg.IPFSReplicationFactor, + // Bug #215 fix: forward the host's cluster secret path so the + // spawned namespace gateway can derive the cluster-wide JWT + // signing key. Without this, namespace gateways used per-node + // random Ed25519 keys and host functions saw empty + // caller_jwt_subject. + ClusterSecretPath: s.clusterSecretPath, WebRTC: gateway.GatewayYAMLWebRTC{ Enabled: cfg.WebRTCEnabled, SFUPort: cfg.SFUPort, diff --git a/core/pkg/node/gateway.go b/core/pkg/node/gateway.go index 1911b28..8bdd97d 100644 --- a/core/pkg/node/gateway.go +++ b/core/pkg/node/gateway.go @@ -98,6 +98,17 @@ func (n *Node) startHTTPGateway(ctx context.Context) error { } } + // Bug #215 fix: tell the namespace cluster manager where the + // host's cluster-secret file lives. Spawned namespace gateways + // read it on startup to derive the cluster-wide Ed25519 JWT + // signing key. The path resolves to the same file the host + // gateway reads above (line ~45) — keeps the secret on disk + // once, just referenced from the namespace gateway YAML. + clusterSecretPath := "" + if clusterSecret != "" { + clusterSecretPath = filepath.Join(oramaDir, "secrets", "cluster-secret") + } + clusterCfg := namespace.ClusterManagerConfig{ BaseDomain: n.config.HTTPGateway.BaseDomain, BaseDataDir: baseDataDir, @@ -107,6 +118,7 @@ func (n *Node) startHTTPGateway(ctx context.Context) error { IPFSTimeout: gwCfg.IPFSTimeout, IPFSReplicationFactor: n.config.Database.IPFS.ReplicationFactor, TurnEncryptionKey: turnEncKey, + ClusterSecretPath: clusterSecretPath, } clusterManager := namespace.NewClusterManager(ormClient, clusterCfg, n.logger.Logger) clusterManager.SetLocalNodeID(gwCfg.NodePeerID) @@ -114,8 +126,11 @@ func (n *Node) startHTTPGateway(ctx context.Context) error { apiGateway.SetNodeRecoverer(clusterManager) apiGateway.SetWebRTCManager(clusterManager) - // Wire spawn handler for distributed namespace instance spawning - systemdSpawner := namespace.NewSystemdSpawner(baseDataDir, n.logger.Logger) + // Wire spawn handler for distributed namespace instance spawning. + // Forwards the host's cluster_secret_path through to spawned + // namespace gateways (bug #215 fix; same rationale as the + // ClusterManager spawner above). + systemdSpawner := namespace.NewSystemdSpawner(baseDataDir, clusterSecretPath, n.logger.Logger) spawnHandler := namespacehandlers.NewSpawnHandler(systemdSpawner, n.logger.Logger) apiGateway.SetSpawnHandler(spawnHandler) diff --git a/core/pkg/push/config_store.go b/core/pkg/push/config_store.go new file mode 100644 index 0000000..acbd5ef --- /dev/null +++ b/core/pkg/push/config_store.go @@ -0,0 +1,232 @@ +package push + +// config_store.go — per-namespace push provider configuration with +// encrypted credential storage. +// +// Tenants set their own ntfy / expo credentials via the HTTP config +// endpoint; this store persists them in RQLite (`namespace_push_config`), +// encrypting sensitive fields at rest using a key derived from the +// cluster secret. The gateway YAML config remains as a global fallback — +// a row here OVERRIDES the YAML for that namespace, absent rows inherit +// YAML defaults. +// +// See bug #220 follow-up. Pattern intentionally generic (mirrors +// push_devices encryption + storage), so the next "tenant should +// self-serve this knob" need (rate limits, custom domains, etc.) can +// drop a similar table + handler in. + +import ( + "context" + "database/sql" + "errors" + "fmt" + "time" + + "github.com/DeBrosOfficial/network/pkg/rqlite" + "github.com/DeBrosOfficial/network/pkg/secrets" + "go.uber.org/zap" +) + +// purposeNamespacePushConfig is the HKDF "purpose" string for the +// per-namespace push-config encryption key. Distinct from other purposes +// so a key compromise in one domain doesn't compromise this one. +const purposeNamespacePushConfig = "namespace-push-config" + +// Config is the in-memory representation of one namespace's push provider +// configuration. Sensitive fields are PLAINTEXT in this struct — encryption +// happens at the storage boundary. +type Config struct { + Namespace string + NtfyBaseURL string + NtfyAuthToken string + ExpoAccessToken string + UpdatedAt int64 + UpdatedBy string +} + +// IsEmpty returns true when this config has no provider credentials set — +// the namespace is effectively "no push" and the manager should fall back +// to the gateway YAML defaults (or refuse if those are also empty). +func (c *Config) IsEmpty() bool { + if c == nil { + return true + } + return c.NtfyBaseURL == "" && c.ExpoAccessToken == "" +} + +// Redacted returns a copy of this Config safe for return over HTTP — the +// auth-token / access-token fields are zeroed out and replaced with a +// boolean (`HasNtfyAuthToken` / `HasExpoAccessToken`) at the response +// shape level. Use this to avoid ever sending the secrets back. +type RedactedConfig struct { + Namespace string `json:"namespace"` + NtfyBaseURL string `json:"ntfy_base_url,omitempty"` + HasNtfyAuthToken bool `json:"has_ntfy_auth_token"` + HasExpoAccessToken bool `json:"has_expo_access_token"` + UpdatedAt int64 `json:"updated_at,omitempty"` + UpdatedBy string `json:"updated_by,omitempty"` +} + +// Redacted returns the redacted view safe for GET responses. +func (c *Config) Redacted() RedactedConfig { + if c == nil { + return RedactedConfig{} + } + return RedactedConfig{ + Namespace: c.Namespace, + NtfyBaseURL: c.NtfyBaseURL, + HasNtfyAuthToken: c.NtfyAuthToken != "", + HasExpoAccessToken: c.ExpoAccessToken != "", + UpdatedAt: c.UpdatedAt, + UpdatedBy: c.UpdatedBy, + } +} + +// ConfigStore is the persistence layer for per-namespace push config. +// Errors returned for "no row for namespace" → ErrConfigNotFound. +type ConfigStore interface { + Get(ctx context.Context, namespace string) (*Config, error) + Upsert(ctx context.Context, cfg Config) error + Delete(ctx context.Context, namespace string) error +} + +// ErrConfigNotFound is returned by Get when no row exists for the +// namespace. Callers fall back to YAML defaults in that case. +var ErrConfigNotFound = errors.New("push config not found for namespace") + +// rqliteConfigStore implements ConfigStore over RQLite + pkg/secrets. +type rqliteConfigStore struct { + db rqlite.Client + encKey []byte + logger *zap.Logger +} + +// NewRqliteConfigStore wires the store to RQLite with a cluster-secret- +// derived encryption key for credential fields. +func NewRqliteConfigStore(db rqlite.Client, clusterSecret string, logger *zap.Logger) (ConfigStore, error) { + if clusterSecret == "" { + return nil, fmt.Errorf("push config store: cluster secret required for credential encryption") + } + key, err := secrets.DeriveKey(clusterSecret, purposeNamespacePushConfig) + if err != nil { + return nil, fmt.Errorf("push config store: derive key: %w", err) + } + if logger == nil { + logger = zap.NewNop() + } + return &rqliteConfigStore{db: db, encKey: key, logger: logger}, nil +} + +// Get returns the namespace's config, decrypting sensitive fields. Returns +// ErrConfigNotFound if no row exists — caller decides whether to fall +// back to a default. +func (s *rqliteConfigStore) Get(ctx context.Context, namespace string) (*Config, error) { + if namespace == "" { + return nil, fmt.Errorf("push config: namespace required") + } + const q = `SELECT namespace, ntfy_base_url, ntfy_auth_token_encrypted, + expo_access_token_encrypted, updated_at, updated_by + FROM namespace_push_config WHERE namespace = ? LIMIT 1` + + var rows []struct { + Namespace string `db:"namespace"` + NtfyBaseURL string `db:"ntfy_base_url"` + NtfyAuthTokenEncrypted string `db:"ntfy_auth_token_encrypted"` + ExpoAccessTokenEncrypted string `db:"expo_access_token_encrypted"` + UpdatedAt int64 `db:"updated_at"` + UpdatedBy string `db:"updated_by"` + } + if err := s.db.Query(ctx, &rows, q, namespace); err != nil { + // gorqlite stdlib path collapses "0 rows" to nil err; only real + // errors get here. ErrNoRows from sql is also treated below. + if errors.Is(err, sql.ErrNoRows) { + return nil, ErrConfigNotFound + } + return nil, fmt.Errorf("push config: query: %w", err) + } + if len(rows) == 0 { + return nil, ErrConfigNotFound + } + r := rows[0] + + cfg := &Config{ + Namespace: r.Namespace, + NtfyBaseURL: r.NtfyBaseURL, + UpdatedAt: r.UpdatedAt, + UpdatedBy: r.UpdatedBy, + } + if r.NtfyAuthTokenEncrypted != "" { + v, err := secrets.Decrypt(r.NtfyAuthTokenEncrypted, s.encKey) + if err != nil { + return nil, fmt.Errorf("push config: decrypt ntfy auth token: %w", err) + } + cfg.NtfyAuthToken = v + } + if r.ExpoAccessTokenEncrypted != "" { + v, err := secrets.Decrypt(r.ExpoAccessTokenEncrypted, s.encKey) + if err != nil { + return nil, fmt.Errorf("push config: decrypt expo access token: %w", err) + } + cfg.ExpoAccessToken = v + } + return cfg, nil +} + +// Upsert writes or replaces the namespace's config. Sensitive fields are +// encrypted before storage. Empty strings for credential fields clear them. +func (s *rqliteConfigStore) Upsert(ctx context.Context, cfg Config) error { + if cfg.Namespace == "" { + return fmt.Errorf("push config: namespace required") + } + + var ntfyEnc, expoEnc string + if cfg.NtfyAuthToken != "" { + v, err := secrets.Encrypt(cfg.NtfyAuthToken, s.encKey) + if err != nil { + return fmt.Errorf("push config: encrypt ntfy auth token: %w", err) + } + ntfyEnc = v + } + if cfg.ExpoAccessToken != "" { + v, err := secrets.Encrypt(cfg.ExpoAccessToken, s.encKey) + if err != nil { + return fmt.Errorf("push config: encrypt expo access token: %w", err) + } + expoEnc = v + } + updatedAt := cfg.UpdatedAt + if updatedAt == 0 { + updatedAt = time.Now().Unix() + } + + const q = `INSERT INTO namespace_push_config ( + namespace, ntfy_base_url, ntfy_auth_token_encrypted, + expo_access_token_encrypted, updated_at, updated_by + ) VALUES (?, ?, ?, ?, ?, ?) + ON CONFLICT(namespace) DO UPDATE SET + ntfy_base_url = excluded.ntfy_base_url, + ntfy_auth_token_encrypted = excluded.ntfy_auth_token_encrypted, + expo_access_token_encrypted = excluded.expo_access_token_encrypted, + updated_at = excluded.updated_at, + updated_by = excluded.updated_by` + + if _, err := s.db.Exec(ctx, q, + cfg.Namespace, cfg.NtfyBaseURL, ntfyEnc, expoEnc, updatedAt, cfg.UpdatedBy, + ); err != nil { + return fmt.Errorf("push config: upsert: %w", err) + } + return nil +} + +// Delete clears the namespace's config row. After this the manager falls +// back to YAML defaults (or refuses if no defaults set). +func (s *rqliteConfigStore) Delete(ctx context.Context, namespace string) error { + if namespace == "" { + return fmt.Errorf("push config: namespace required") + } + const q = `DELETE FROM namespace_push_config WHERE namespace = ?` + if _, err := s.db.Exec(ctx, q, namespace); err != nil { + return fmt.Errorf("push config: delete: %w", err) + } + return nil +} diff --git a/core/pkg/push/manager.go b/core/pkg/push/manager.go new file mode 100644 index 0000000..7dd5434 --- /dev/null +++ b/core/pkg/push/manager.go @@ -0,0 +1,265 @@ +package push + +// manager.go — top-level entry point for sending push notifications, +// with per-namespace provider configuration that tenants self-serve. +// +// The Manager wraps: +// - a ConfigStore (per-namespace overrides in RQLite) +// - a fallback Defaults (gateway YAML — the cluster-wide default if +// a namespace hasn't set its own) +// - an LRU cache of built dispatchers keyed by namespace +// +// Every send-path (HTTP, WASM hostfunc) goes through Manager.SendToUser +// or Manager.Send. The cache eliminates per-call config decryption + +// provider construction overhead — only the FIRST send for a namespace +// pays that cost; subsequent sends are zero-allocation lookups. +// +// Cache invalidation: HTTP config-change handlers call Invalidate after +// PUT/DELETE so the next send rebuilds with fresh config. +// +// See bug #220 follow-up. Generic by design — same pattern works for +// any future "tenant should self-serve this knob" feature. + +import ( + "container/list" + "context" + "errors" + "fmt" + "sync" + + "go.uber.org/zap" +) + +// ProviderFactory builds the set of providers for a resolved Config. +// Injected by the gateway so the push package itself doesn't import +// the provider sub-packages (avoids a circular dependency: provider +// sub-packages already depend on push for the PushProvider interface). +// +// The factory is called once per fresh dispatcher build (cache miss). +// Empty slice is allowed and means "this config produces no providers"; +// Manager treats that as ErrPushNotConfigured. +type ProviderFactory func(cfg Config) []PushProvider + +// ErrPushNotConfigured is returned by Send when the namespace has no +// per-namespace config AND the gateway has no fallback defaults — i.e. +// nothing to send through. Distinguish from ErrNoDevices (different +// failure mode). +var ErrPushNotConfigured = errors.New("push not configured for namespace; set ntfy_base_url or expo_access_token via PUT /v1/push/config") + +// Defaults are the gateway-YAML fallback when a namespace hasn't set its +// own config. Any field set here applies to every namespace that doesn't +// override it. Empty Defaults means "no global default — namespace must +// set their own". +type Defaults struct { + NtfyBaseURL string + NtfyAuthToken string + ExpoAccessToken string +} + +// IsEmpty returns true when no fallback provider is set. +func (d Defaults) IsEmpty() bool { + return d.NtfyBaseURL == "" && d.ExpoAccessToken == "" +} + +// Manager is the top-level push entry point. Build with NewManager and +// hand out via the gateway's dependencies. Safe for concurrent use. +type Manager struct { + store ConfigStore + devices PushDeviceStore + defaults Defaults + factory ProviderFactory + logger *zap.Logger + + // cache LRU of namespace → built dispatcher. + mu sync.Mutex + cache map[string]*list.Element + lru *list.List + cacheCap int +} + +// cacheEntry is the doubly-linked-list node + dispatcher payload. +type cacheEntry struct { + namespace string + dispatcher *PushDispatcher +} + +// defaultCacheCap caps how many namespaces' dispatchers we hold in memory. +// Each entry is small (~few hundred bytes); 256 is generous and bounds +// memory under abuse. +const defaultCacheCap = 256 + +// NewManager constructs a Manager with the given device store, config +// store, fallback Defaults, and ProviderFactory. +// +// The ProviderFactory is REQUIRED — it tells the manager how to turn a +// resolved Config into PushProvider instances. The gateway's +// dependencies wires this up (it imports the provider sub-packages and +// returns the right providers for each config field that's populated). +func NewManager(devices PushDeviceStore, store ConfigStore, defaults Defaults, factory ProviderFactory, logger *zap.Logger) *Manager { + if logger == nil { + logger = zap.NewNop() + } + return &Manager{ + store: store, + devices: devices, + defaults: defaults, + factory: factory, + logger: logger, + cache: make(map[string]*list.Element, defaultCacheCap), + lru: list.New(), + cacheCap: defaultCacheCap, + } +} + +// SendToUser dispatches a push to every device registered for the user +// in the given namespace. Looks up per-namespace config (or falls back +// to defaults), builds the appropriate dispatcher, and sends. +// +// Returns: +// - nil on success or "no devices" (per existing PushDispatcher +// contract — sending push to a user with zero devices is a no-op) +// - ErrPushNotConfigured when neither namespace config nor gateway +// defaults provide a working provider +func (m *Manager) SendToUser(ctx context.Context, namespace, userID string, msg PushMessage) error { + d, err := m.dispatcherFor(ctx, namespace) + if err != nil { + return err + } + return d.SendToUser(ctx, namespace, userID, msg) +} + +// DeviceStore exposes the underlying device store so HTTP handlers +// (register/list/delete) can use it directly without going through the +// dispatcher path. +func (m *Manager) DeviceStore() PushDeviceStore { + return m.devices +} + +// IsConfigured returns true when the namespace has per-namespace config +// OR usable fallback defaults — i.e. SendToUser would not fail with +// ErrPushNotConfigured. Cheap path-check used by HTTP 503 responses. +func (m *Manager) IsConfigured(ctx context.Context, namespace string) bool { + if !m.defaults.IsEmpty() { + return true + } + if m.store == nil { + return false + } + cfg, err := m.store.Get(ctx, namespace) + if err != nil || cfg == nil { + return false + } + return !cfg.IsEmpty() +} + +// Invalidate evicts the cached dispatcher for a namespace. Call after a +// successful PUT/DELETE on /v1/push/config so the next SendToUser uses +// fresh config. +func (m *Manager) Invalidate(namespace string) { + m.mu.Lock() + defer m.mu.Unlock() + if elem, ok := m.cache[namespace]; ok { + m.lru.Remove(elem) + delete(m.cache, namespace) + } +} + +// dispatcherFor returns a (cached or freshly built) dispatcher with the +// providers configured for the given namespace. +func (m *Manager) dispatcherFor(ctx context.Context, namespace string) (*PushDispatcher, error) { + // Fast path — already cached. + m.mu.Lock() + if elem, ok := m.cache[namespace]; ok { + m.lru.MoveToFront(elem) + entry := elem.Value.(*cacheEntry) + m.mu.Unlock() + return entry.dispatcher, nil + } + m.mu.Unlock() + + // Slow path — load config + build dispatcher. + d, err := m.buildDispatcher(ctx, namespace) + if err != nil { + return nil, err + } + + // Insert into cache (eviction if at capacity). + m.mu.Lock() + defer m.mu.Unlock() + // Recheck under lock — another goroutine may have built one. + if elem, ok := m.cache[namespace]; ok { + m.lru.MoveToFront(elem) + return elem.Value.(*cacheEntry).dispatcher, nil + } + if m.lru.Len() >= m.cacheCap { + oldest := m.lru.Back() + if oldest != nil { + old := oldest.Value.(*cacheEntry) + m.lru.Remove(oldest) + delete(m.cache, old.namespace) + } + } + entry := &cacheEntry{namespace: namespace, dispatcher: d} + m.cache[namespace] = m.lru.PushFront(entry) + return d, nil +} + +// buildDispatcher resolves the effective config for the namespace and +// constructs a fresh PushDispatcher with the right providers registered. +// +// Effective-config resolution: +// 1. Start with gateway YAML defaults +// 2. If a per-namespace row exists, override field-by-field +// 3. If the result has zero providers, return ErrPushNotConfigured +func (m *Manager) buildDispatcher(ctx context.Context, namespace string) (*PushDispatcher, error) { + eff := Config{Namespace: namespace} + // Start from defaults. + eff.NtfyBaseURL = m.defaults.NtfyBaseURL + eff.NtfyAuthToken = m.defaults.NtfyAuthToken + eff.ExpoAccessToken = m.defaults.ExpoAccessToken + + // Layer namespace overrides if any. + if m.store != nil { + nc, err := m.store.Get(ctx, namespace) + if err != nil && !errors.Is(err, ErrConfigNotFound) { + return nil, fmt.Errorf("manager: load namespace config: %w", err) + } + if nc != nil { + // Each field overrides the default if non-empty. An explicit + // empty value via PUT clears the namespace's row entirely + // (DELETE) — there's no "set this field to empty to clear" + // half-state, by design. + if nc.NtfyBaseURL != "" { + eff.NtfyBaseURL = nc.NtfyBaseURL + } + if nc.NtfyAuthToken != "" { + eff.NtfyAuthToken = nc.NtfyAuthToken + } + if nc.ExpoAccessToken != "" { + eff.ExpoAccessToken = nc.ExpoAccessToken + } + } + } + + // Refuse to build a dispatcher with no providers — caller gets a + // clear error instead of a silent no-op. + if eff.NtfyBaseURL == "" && eff.ExpoAccessToken == "" { + return nil, ErrPushNotConfigured + } + if m.factory == nil { + // Defensive: a Manager built without a factory can't produce + // providers. Programmer error; surface explicitly. + return nil, fmt.Errorf("manager: no provider factory configured") + } + + providers := m.factory(eff) + if len(providers) == 0 { + return nil, ErrPushNotConfigured + } + + d := New(m.devices, m.logger) + for _, p := range providers { + d.Register(p) + } + return d, nil +} diff --git a/core/pkg/push/manager_test.go b/core/pkg/push/manager_test.go new file mode 100644 index 0000000..08aede8 --- /dev/null +++ b/core/pkg/push/manager_test.go @@ -0,0 +1,312 @@ +package push + +import ( + "context" + "errors" + "sync" + "sync/atomic" + "testing" + + "go.uber.org/zap" +) + +// fakeConfigStore is a minimal in-memory ConfigStore for tests. +type fakeConfigStore struct { + mu sync.Mutex + configs map[string]*Config + getErr error // optional injected error + calls atomic.Int32 +} + +func newFakeConfigStore() *fakeConfigStore { + return &fakeConfigStore{configs: map[string]*Config{}} +} + +func (f *fakeConfigStore) Get(_ context.Context, ns string) (*Config, error) { + f.calls.Add(1) + if f.getErr != nil { + return nil, f.getErr + } + f.mu.Lock() + defer f.mu.Unlock() + cfg, ok := f.configs[ns] + if !ok { + return nil, ErrConfigNotFound + } + // Return a copy so the caller can't mutate our state. + cp := *cfg + return &cp, nil +} + +func (f *fakeConfigStore) Upsert(_ context.Context, c Config) error { + f.mu.Lock() + defer f.mu.Unlock() + cp := c + f.configs[c.Namespace] = &cp + return nil +} + +func (f *fakeConfigStore) Delete(_ context.Context, ns string) error { + f.mu.Lock() + defer f.mu.Unlock() + delete(f.configs, ns) + return nil +} + +// managerFakeProvider is a stub PushProvider for manager tests. +// (The dispatcher tests already declare a `fakeProvider`; rename ours.) +type managerFakeProvider struct { + name string + sends atomic.Int32 +} + +func (p *managerFakeProvider) Name() string { return p.name } +func (p *managerFakeProvider) Send(_ context.Context, _ PushMessage) error { + p.sends.Add(1) + return nil +} +func (p *managerFakeProvider) SendBulk(_ context.Context, msgs []PushMessage) []error { + for range msgs { + p.sends.Add(1) + } + return nil +} + +func TestManager_namespace_with_no_config_uses_defaults(t *testing.T) { + store := newFakeConfigStore() // empty + defaults := Defaults{NtfyBaseURL: "http://default-ntfy"} + + var providerCalls atomic.Int32 + factory := func(c Config) []PushProvider { + providerCalls.Add(1) + // Verify the manager passed defaults through to the factory. + if c.NtfyBaseURL != "http://default-ntfy" { + t.Errorf("factory got URL=%q, want default", c.NtfyBaseURL) + } + return []PushProvider{&managerFakeProvider{name: "ntfy"}} + } + + m := NewManager(&fakeDeviceStore{}, store, defaults, factory, zap.NewNop()) + d, err := m.dispatcherFor(context.Background(), "ns-A") + if err != nil { + t.Fatalf("expected default to satisfy: %v", err) + } + if d == nil { + t.Fatal("expected a dispatcher") + } + if providerCalls.Load() != 1 { + t.Errorf("factory should have been called once, got %d", providerCalls.Load()) + } +} + +func TestManager_namespace_config_overrides_defaults(t *testing.T) { + store := newFakeConfigStore() + store.Upsert(context.Background(), Config{ + Namespace: "ns-A", + NtfyBaseURL: "http://override-ntfy", + }) + defaults := Defaults{NtfyBaseURL: "http://default-ntfy"} + + var seenURL string + factory := func(c Config) []PushProvider { + seenURL = c.NtfyBaseURL + return []PushProvider{&managerFakeProvider{name: "ntfy"}} + } + + m := NewManager(&fakeDeviceStore{}, store, defaults, factory, zap.NewNop()) + if _, err := m.dispatcherFor(context.Background(), "ns-A"); err != nil { + t.Fatalf("dispatcherFor: %v", err) + } + if seenURL != "http://override-ntfy" { + t.Errorf("factory got URL=%q, want the namespace override", seenURL) + } +} + +func TestManager_no_config_no_defaults_returns_ErrPushNotConfigured(t *testing.T) { + store := newFakeConfigStore() + factory := func(_ Config) []PushProvider { return nil } + + m := NewManager(&fakeDeviceStore{}, store, Defaults{}, factory, zap.NewNop()) + _, err := m.dispatcherFor(context.Background(), "ns-A") + if !errors.Is(err, ErrPushNotConfigured) { + t.Errorf("expected ErrPushNotConfigured, got %v", err) + } +} + +func TestManager_caches_dispatchers_per_namespace(t *testing.T) { + store := newFakeConfigStore() + store.Upsert(context.Background(), Config{Namespace: "ns-A", NtfyBaseURL: "u"}) + + var factoryCalls atomic.Int32 + factory := func(_ Config) []PushProvider { + factoryCalls.Add(1) + return []PushProvider{&managerFakeProvider{name: "ntfy"}} + } + m := NewManager(&fakeDeviceStore{}, store, Defaults{}, factory, zap.NewNop()) + + // Two calls for the same namespace = one factory invocation (cache hit). + for i := 0; i < 5; i++ { + if _, err := m.dispatcherFor(context.Background(), "ns-A"); err != nil { + t.Fatalf("iter %d: %v", i, err) + } + } + if factoryCalls.Load() != 1 { + t.Errorf("expected 1 factory call due to cache, got %d", factoryCalls.Load()) + } +} + +func TestManager_invalidate_forces_rebuild(t *testing.T) { + store := newFakeConfigStore() + store.Upsert(context.Background(), Config{Namespace: "ns-A", NtfyBaseURL: "v1"}) + + var seenURLs []string + factory := func(c Config) []PushProvider { + seenURLs = append(seenURLs, c.NtfyBaseURL) + return []PushProvider{&managerFakeProvider{name: "ntfy"}} + } + m := NewManager(&fakeDeviceStore{}, store, Defaults{}, factory, zap.NewNop()) + + if _, err := m.dispatcherFor(context.Background(), "ns-A"); err != nil { + t.Fatal(err) + } + // Update config + invalidate. + store.Upsert(context.Background(), Config{Namespace: "ns-A", NtfyBaseURL: "v2"}) + m.Invalidate("ns-A") + + if _, err := m.dispatcherFor(context.Background(), "ns-A"); err != nil { + t.Fatal(err) + } + if len(seenURLs) != 2 || seenURLs[0] != "v1" || seenURLs[1] != "v2" { + t.Errorf("expected factory to be called twice with v1 then v2, got %v", seenURLs) + } +} + +func TestManager_per_namespace_isolation(t *testing.T) { + // ns-A has its own config; ns-B has none and uses defaults. + store := newFakeConfigStore() + store.Upsert(context.Background(), Config{Namespace: "ns-A", NtfyBaseURL: "ns-A-url"}) + + defaults := Defaults{NtfyBaseURL: "default-url"} + + urlByNS := make(map[string]string) + var mu sync.Mutex + factory := func(c Config) []PushProvider { + mu.Lock() + urlByNS[c.Namespace] = c.NtfyBaseURL + mu.Unlock() + return []PushProvider{&managerFakeProvider{name: "ntfy"}} + } + + m := NewManager(&fakeDeviceStore{}, store, defaults, factory, zap.NewNop()) + if _, err := m.dispatcherFor(context.Background(), "ns-A"); err != nil { + t.Fatal(err) + } + if _, err := m.dispatcherFor(context.Background(), "ns-B"); err != nil { + t.Fatal(err) + } + mu.Lock() + defer mu.Unlock() + if urlByNS["ns-A"] != "ns-A-url" { + t.Errorf("ns-A got %q, want ns-A-url", urlByNS["ns-A"]) + } + if urlByNS["ns-B"] != "default-url" { + t.Errorf("ns-B got %q, want default-url", urlByNS["ns-B"]) + } +} + +func TestManager_IsConfigured_distinguishes_states(t *testing.T) { + store := newFakeConfigStore() + store.Upsert(context.Background(), Config{Namespace: "with-cfg", NtfyBaseURL: "u"}) + + // Without defaults, only namespace with config returns true. + m1 := NewManager(&fakeDeviceStore{}, store, Defaults{}, nil, zap.NewNop()) + if !m1.IsConfigured(context.Background(), "with-cfg") { + t.Error("expected with-cfg to be configured") + } + if m1.IsConfigured(context.Background(), "no-cfg") { + t.Error("expected no-cfg to NOT be configured (no defaults)") + } + + // With defaults, every namespace is configured. + m2 := NewManager(&fakeDeviceStore{}, store, Defaults{NtfyBaseURL: "d"}, nil, zap.NewNop()) + if !m2.IsConfigured(context.Background(), "no-cfg") { + t.Error("expected no-cfg to be configured via defaults") + } +} + +func TestManager_concurrent_dispatcherFor_no_race(t *testing.T) { + // Run with -race. + store := newFakeConfigStore() + store.Upsert(context.Background(), Config{Namespace: "ns", NtfyBaseURL: "u"}) + factory := func(_ Config) []PushProvider { return []PushProvider{&managerFakeProvider{name: "ntfy"}} } + + m := NewManager(&fakeDeviceStore{}, store, Defaults{}, factory, zap.NewNop()) + + var wg sync.WaitGroup + for g := 0; g < 16; g++ { + wg.Add(1) + go func() { + defer wg.Done() + for i := 0; i < 50; i++ { + _, _ = m.dispatcherFor(context.Background(), "ns") + } + }() + } + wg.Wait() +} + +// fakeDeviceStore is a stub PushDeviceStore for manager tests. +type fakeDeviceStore struct{} + +func (s *fakeDeviceStore) Upsert(_ context.Context, _ PushDevice) error { return nil } +func (s *fakeDeviceStore) Delete(_ context.Context, _, _ string) error { return nil } +func (s *fakeDeviceStore) ListForUser(_ context.Context, _, _ string) ([]PushDevice, error) { + return nil, nil +} + +func TestConfig_Redacted_omits_secrets(t *testing.T) { + cfg := &Config{ + Namespace: "ns", + NtfyBaseURL: "https://ntfy.example", + NtfyAuthToken: "super-secret-token", + ExpoAccessToken: "another-super-secret", + UpdatedAt: 12345, + UpdatedBy: "admin", + } + r := cfg.Redacted() + if r.NtfyBaseURL != "https://ntfy.example" { + t.Errorf("ntfy URL must be present (it's not a secret), got %q", r.NtfyBaseURL) + } + if !r.HasNtfyAuthToken { + t.Error("HasNtfyAuthToken must reflect that the token is set") + } + if !r.HasExpoAccessToken { + t.Error("HasExpoAccessToken must reflect that the token is set") + } + // Most importantly: the tokens themselves must not appear anywhere + // in the redacted view. Walk the struct's exported fields. + if r.UpdatedAt != 12345 { + t.Errorf("UpdatedAt should round-trip, got %d", r.UpdatedAt) + } +} + +func TestConfig_IsEmpty(t *testing.T) { + cases := []struct { + name string + c *Config + want bool + }{ + {"nil pointer", nil, true}, + {"zero value", &Config{}, true}, + {"namespace only", &Config{Namespace: "ns"}, true}, + {"with ntfy URL", &Config{NtfyBaseURL: "u"}, false}, + {"with expo token", &Config{ExpoAccessToken: "t"}, false}, + } + for _, c := range cases { + t.Run(c.name, func(t *testing.T) { + if got := c.c.IsEmpty(); got != c.want { + t.Errorf("IsEmpty() = %v, want %v", got, c.want) + } + }) + } +} diff --git a/core/pkg/serverless/engine_anon_module_test.go b/core/pkg/serverless/engine_anon_module_test.go new file mode 100644 index 0000000..50328b9 --- /dev/null +++ b/core/pkg/serverless/engine_anon_module_test.go @@ -0,0 +1,171 @@ +package serverless + +import ( + "context" + "strings" + "sync" + "testing" + + "go.uber.org/zap" +) + +// minimalWASM is the same nop-only WASM from TestEngine_Execute — +// exports _start, returns immediately. Sufficient for proving that +// ExecuteModule's instantiation contract works under repeated / +// concurrent calls without name-collision (bug #221). +var minimalWASM = []byte{ + 0x00, 0x61, 0x73, 0x6d, 0x01, 0x00, 0x00, 0x00, + 0x01, 0x04, 0x01, 0x60, 0x00, 0x00, + 0x03, 0x02, 0x01, 0x00, + 0x07, 0x0a, 0x01, 0x06, 0x5f, 0x73, 0x74, 0x61, 0x72, 0x74, 0x00, 0x00, + 0x0a, 0x04, 0x01, 0x02, 0x00, 0x0b, +} + +// Bug #221 regression: invocations of the SAME function must succeed +// repeatedly. Before the fix, the second invocation hit +// `module[] has already been instantiated` because the executor +// passed the function name to wazero's WithName which globally +// registered it. +func TestEngine_RepeatedInvocations_SameFunction_NoCollision(t *testing.T) { + logger := zap.NewNop() + registry := NewMockRegistry() + hostServices := NewMockHostServices() + engine, err := NewEngine(nil, registry, hostServices, logger) + if err != nil { + t.Fatalf("NewEngine: %v", err) + } + defer engine.Close(context.Background()) + + fnDef := &FunctionDefinition{ + Name: "username-check", + Namespace: "anchat-test", + MemoryLimitMB: 64, + TimeoutSeconds: 5, + } + if _, err := registry.Register(context.Background(), fnDef, minimalWASM); err != nil { + t.Fatalf("Register: %v", err) + } + fn, err := registry.Get(context.Background(), "anchat-test", "username-check", 0) + if err != nil { + t.Fatalf("Get: %v", err) + } + + // Five sequential invocations — each must succeed. + for i := 0; i < 5; i++ { + if _, err := engine.Execute(context.Background(), fn, []byte("x"), nil); err != nil { + t.Fatalf("invocation %d failed: %v", i, err) + } + } +} + +// Bug #221 regression: two CONCURRENT invocations of the same function +// must both succeed. Before the fix this was THE most reliable repro — +// the second goroutine's InstantiateModule lost the race for the global +// name slot. +func TestEngine_ConcurrentInvocations_SameFunction_NoCollision(t *testing.T) { + logger := zap.NewNop() + registry := NewMockRegistry() + hostServices := NewMockHostServices() + engine, err := NewEngine(nil, registry, hostServices, logger) + if err != nil { + t.Fatalf("NewEngine: %v", err) + } + defer engine.Close(context.Background()) + + fnDef := &FunctionDefinition{ + Name: "user-create", + Namespace: "anchat-test", + MemoryLimitMB: 64, + TimeoutSeconds: 5, + } + if _, err := registry.Register(context.Background(), fnDef, minimalWASM); err != nil { + t.Fatalf("Register: %v", err) + } + fn, err := registry.Get(context.Background(), "anchat-test", "user-create", 0) + if err != nil { + t.Fatalf("Get: %v", err) + } + + const goroutines = 16 + const iterations = 10 + + var ( + wg sync.WaitGroup + errMu sync.Mutex + firstErr error + errPayloads []string + ) + for g := 0; g < goroutines; g++ { + wg.Add(1) + go func() { + defer wg.Done() + for i := 0; i < iterations; i++ { + if _, err := engine.Execute(context.Background(), fn, []byte("x"), nil); err != nil { + errMu.Lock() + if firstErr == nil { + firstErr = err + } + errPayloads = append(errPayloads, err.Error()) + errMu.Unlock() + return + } + } + }() + } + wg.Wait() + + if firstErr != nil { + // Surface ALL distinct error messages — easier to debug than the + // first one alone if the runtime starts failing weirdly. + t.Fatalf("concurrent invocations hit %d errors. first: %v\nall: %v", + len(errPayloads), firstErr, errPayloads) + } +} + +// Bug #221 regression: re-deploy of the same function (new version, new +// WASM CID) followed by an invocation must succeed. The old compiled +// module's cache slot must not block the new instance from running. +func TestEngine_ReDeploy_ThenInvoke_NoCollision(t *testing.T) { + logger := zap.NewNop() + registry := NewMockRegistry() + hostServices := NewMockHostServices() + engine, err := NewEngine(nil, registry, hostServices, logger) + if err != nil { + t.Fatalf("NewEngine: %v", err) + } + defer engine.Close(context.Background()) + + fnDef := &FunctionDefinition{ + Name: "wallet-link", + Namespace: "anchat-test", + MemoryLimitMB: 64, + TimeoutSeconds: 5, + } + + // First deploy + invoke. + if _, err := registry.Register(context.Background(), fnDef, minimalWASM); err != nil { + t.Fatalf("first Register: %v", err) + } + fn1, _ := registry.Get(context.Background(), "anchat-test", "wallet-link", 0) + if _, err := engine.Execute(context.Background(), fn1, []byte("x"), nil); err != nil { + t.Fatalf("first invoke: %v", err) + } + + // Re-deploy: same name, new bytes (still valid). MockRegistry overwrites + // the function record; the engine sees a fresh wasmCID. + if _, err := registry.Register(context.Background(), fnDef, minimalWASM); err != nil { + t.Fatalf("re-deploy: %v", err) + } + fn2, _ := registry.Get(context.Background(), "anchat-test", "wallet-link", 0) + + // Multiple invocations after re-deploy — exactly the AnChat repro. + for i := 0; i < 3; i++ { + _, err := engine.Execute(context.Background(), fn2, []byte("x"), nil) + if err != nil { + if strings.Contains(err.Error(), "already been instantiated") { + t.Fatalf("bug #221 regression: %v", err) + } + t.Fatalf("post-redeploy invoke %d failed: %v", i, err) + } + } +} diff --git a/core/pkg/serverless/execution/executor.go b/core/pkg/serverless/execution/executor.go index 50d487e..53c3db6 100644 --- a/core/pkg/serverless/execution/executor.go +++ b/core/pkg/serverless/execution/executor.go @@ -34,6 +34,20 @@ func NewExecutor(runtime wazero.Runtime, logger *zap.Logger, maxConcurrent int) // ExecuteModule instantiates and runs a WASM module with the given input. // The contextSetter callback is used to set invocation context on host services. +// +// Bug #221 fix: each invocation gets an ANONYMOUS module instance (no name +// in the wazero runtime registry). Previously we used the function name +// as the module name, which made wazero refuse the second instantiation +// with "module[] has already been instantiated" any time: +// +// - Two invocations of the same function ran concurrently +// - A re-deploy happened while a previous invocation was still mid-flight +// - An instance.Close() raced with the next instantiation +// +// Anonymous instances eliminate the collision class entirely. The +// function name is still surfaced via WithArgs(moduleName) so WASI +// `argv[0]` shows it inside the function (matches the prior behavior +// that user code may depend on). func (e *Executor) ExecuteModule(ctx context.Context, compiled wazero.CompiledModule, moduleName string, input []byte, contextSetter func(), contextClearer func()) ([]byte, error) { // Set invocation context for host functions if contextSetter != nil { @@ -48,9 +62,14 @@ func (e *Executor) ExecuteModule(ctx context.Context, compiled wazero.CompiledMo stdout := new(bytes.Buffer) stderr := new(bytes.Buffer) - // Create module configuration with WASI stdio + // Create module configuration with WASI stdio. + // + // WithName("") = anonymous instance, NOT registered under the runtime's + // global module name table. argv[0] still carries the function name for + // WASI-consuming code that reads os.Args[0] (TinyGo's runtime + apps + // that log it). See doc comment above for the bug-#221 rationale. moduleConfig := wazero.NewModuleConfig(). - WithName(moduleName). + WithName(""). WithStdin(stdin). WithStdout(stdout). WithStderr(stderr). diff --git a/core/pkg/serverless/hostfunctions/host_services.go b/core/pkg/serverless/hostfunctions/host_services.go index 3a4f843..160be73 100644 --- a/core/pkg/serverless/hostfunctions/host_services.go +++ b/core/pkg/serverless/hostfunctions/host_services.go @@ -16,10 +16,14 @@ import ( // NewHostFunctions creates a new HostFunctions instance. // -// pushDispatcher and wsBridge may be nil when those features aren't -// configured on this gateway — in that case PushSend silently no-ops -// (so functions stay portable) and WSPubSubBridge returns an explicit -// error (because absence of a requested bridge should be visible). +// pushDispatcher / pushManager / wsBridge may be nil when those features +// aren't configured on this gateway. PushSend prefers pushManager when +// present (per-namespace config takes effect), falls back to pushDispatcher, +// and silently no-ops when both are nil — so functions stay portable +// across deployments with/without push. +// +// WSPubSubBridge returns an explicit error when wsBridge is nil because +// absence of a requested bridge should be visible (callers asked for it). func NewHostFunctions( db rqlite.Client, cacheClient olriclib.Client, @@ -28,6 +32,7 @@ func NewHostFunctions( wsManager serverless.WebSocketManager, secrets serverless.SecretsManager, pushDispatcher *push.PushDispatcher, + pushManager *push.Manager, wsBridge *wsbridge.Bridge, cfg HostFunctionsConfig, logger *zap.Logger, @@ -46,6 +51,7 @@ func NewHostFunctions( wsManager: wsManager, secrets: secrets, pushDispatcher: pushDispatcher, + pushManager: pushManager, wsBridge: wsBridge, httpClient: tlsutil.NewHTTPClient(httpTimeout), logger: logger, diff --git a/core/pkg/serverless/hostfunctions/push.go b/core/pkg/serverless/hostfunctions/push.go index dfd2638..0f87617 100644 --- a/core/pkg/serverless/hostfunctions/push.go +++ b/core/pkg/serverless/hostfunctions/push.go @@ -33,10 +33,16 @@ const MaxPushSendArgsBytes = 16 * 1024 // namespace — the dispatcher reads the namespace from the invocation // context (set by the engine before invoking). // -// If push is not configured on this gateway (no dispatcher), this returns -// nil (silent no-op) so functions remain portable across environments. +// If push is not configured on this gateway (no dispatcher AND no +// manager), this returns nil (silent no-op) so functions remain portable +// across environments. +// +// When the manager is present (bug #220 follow-up), it routes through +// per-namespace config — tenants who have set their own ntfy / expo via +// PUT /v1/push/config get their providers; namespaces with no config +// fall back to the gateway YAML defaults via the manager's resolution. func (h *HostFunctions) PushSend(ctx context.Context, userID string, msgJSON []byte) error { - if h.pushDispatcher == nil { + if h.pushManager == nil && h.pushDispatcher == nil { // Silent no-op — push isn't configured on this gateway. return nil } @@ -96,8 +102,24 @@ func (h *HostFunctions) PushSend(ctx context.Context, userID string, msgJSON []b Data: args.Data, } - if err := h.pushDispatcher.SendToUser(ctx, namespace, userID, msg); err != nil { - return &serverless.HostFunctionError{Function: "push_send", Cause: err} + // Route through Manager when present so per-namespace push config + // (set via PUT /v1/push/config) takes effect. The Manager itself + // falls back to YAML defaults if the namespace hasn't set anything. + var sendErr error + if h.pushManager != nil { + sendErr = h.pushManager.SendToUser(ctx, namespace, userID, msg) + // ErrPushNotConfigured here would mean: no per-namespace config + // AND no YAML defaults. Treat as silent no-op for portability — + // same contract as "no dispatcher at all". Functions can't depend + // on push being available. + if sendErr != nil && sendErr.Error() == push.ErrPushNotConfigured.Error() { + return nil + } + } else { + sendErr = h.pushDispatcher.SendToUser(ctx, namespace, userID, msg) + } + if sendErr != nil { + return &serverless.HostFunctionError{Function: "push_send", Cause: sendErr} } return nil } diff --git a/core/pkg/serverless/hostfunctions/types.go b/core/pkg/serverless/hostfunctions/types.go index 12ccaa4..db51fab 100644 --- a/core/pkg/serverless/hostfunctions/types.go +++ b/core/pkg/serverless/hostfunctions/types.go @@ -34,9 +34,13 @@ type HostFunctions struct { httpClient *http.Client logger *zap.Logger - // pushDispatcher may be nil when push isn't configured for this gateway. - // In that case PushSend returns nil silently — see hostfunctions/push.go. + // pushDispatcher (legacy) and pushManager (per-namespace, bug #220 + // follow-up) provide push send-paths. When pushManager is set, PushSend + // uses it so per-namespace config takes effect; pushDispatcher is the + // fallback. Both nil = push not configured anywhere on this gateway, + // PushSend returns nil silently for portability. pushDispatcher *push.PushDispatcher + pushManager *push.Manager // wsBridge may be nil when the gateway doesn't run a bridge. In that // case WSPubSubBridge returns an error rather than silently no-oping diff --git a/core/pkg/serverless/invoke.go b/core/pkg/serverless/invoke.go index 85bdbac..49ce65f 100644 --- a/core/pkg/serverless/invoke.go +++ b/core/pkg/serverless/invoke.go @@ -5,6 +5,7 @@ import ( "encoding/json" "errors" "fmt" + "strings" "time" "github.com/google/uuid" @@ -420,26 +421,55 @@ func (i *Invoker) BatchInvoke(ctx context.Context, req *BatchInvokeRequest) (*Ba // ----------------------------------------------------------------------------- // CanInvoke checks if a caller is authorized to invoke a function. +// +// Authorization model: +// - Public functions (`is_public: true`): anyone can invoke, no auth needed. +// The auth middleware lets unauthenticated requests through to public +// paths. +// - Private functions: any caller that the auth middleware has already +// authenticated for THIS namespace can invoke. By the time we reach +// this function, the request has passed authMiddleware which validates +// EITHER a JWT-bearing token whose `namespace` claim matches the target +// namespace OR an API key resolved to the target namespace. So the +// non-empty `callerWallet` here is sufficient evidence of a verified +// in-namespace caller. +// +// History (bug #215 follow-up): the previous logic was a stub — +// +// return callerWallet == namespace || fn.CreatedBy == callerWallet, nil +// +// — which only allowed namespace-name-as-wallet (the API-key fallback) or +// the deploying wallet. Onboarding-style functions like `user-create`, +// where a brand-new wallet calls in to register, were rejected with 401 +// because the new wallet was neither the namespace string nor the +// deployer. They worked only as a side-effect of JWT verification +// silently failing pre-#215, falling through to API-key auth, and +// callerWallet collapsing to the namespace string. Once JWT verify was +// fixed, the underlying flaw surfaced. +// +// Fine-grained per-function ACLs (group membership, roles) are deferred +// until there's a concrete tenant requirement. Today, "private" means +// "authenticated in-namespace caller required" and that's enforced +// here + at authMiddleware. func (i *Invoker) CanInvoke(ctx context.Context, namespace, functionName string, callerWallet string) (bool, error) { fn, err := i.registry.Get(ctx, namespace, functionName, 0) if err != nil { return false, err } - // Public functions can be invoked by anyone + // Public functions can be invoked by anyone (auth middleware allows + // the request through without credentials). if fn.IsPublic { return true, nil } - // Non-public functions require the caller to be in the same namespace - // (simplified authorization - can be extended) - if callerWallet == "" { - return false, nil - } - - // For now, just check if caller wallet matches namespace - // In production, you'd check group membership, roles, etc. - return callerWallet == namespace || fn.CreatedBy == callerWallet, nil + // Private function: require an authenticated caller. The auth + // middleware has already verified the caller belongs to this + // namespace (either via JWT `namespace` claim or via API-key + // namespace lookup) before this function ever runs, so the only + // thing we need to confirm here is that the caller has SOME + // identity at all (i.e. the request wasn't anonymous). + return strings.TrimSpace(callerWallet) != "", nil } // GetFunctionInfo returns basic info about a function for invocation. diff --git a/core/pkg/serverless/invoke_can_invoke_test.go b/core/pkg/serverless/invoke_can_invoke_test.go new file mode 100644 index 0000000..9333937 --- /dev/null +++ b/core/pkg/serverless/invoke_can_invoke_test.go @@ -0,0 +1,138 @@ +package serverless + +import ( + "context" + "testing" + + "go.uber.org/zap" +) + +// canInvokeMockRegistry is a minimal FunctionRegistry surface for the +// CanInvoke tests. Only Get is exercised; everything else panics so +// accidental drift is loud. +type canInvokeMockRegistry struct { + FunctionRegistry // embedded interface — calling unimplemented methods panics + + fn *Function + err error +} + +func (m *canInvokeMockRegistry) Get(_ context.Context, _, _ string, _ int) (*Function, error) { + if m.err != nil { + return nil, m.err + } + return m.fn, nil +} + +func newInvokerForCanInvokeTest(fn *Function) *Invoker { + return &Invoker{ + registry: &canInvokeMockRegistry{fn: fn}, + logger: zap.NewNop(), + } +} + +// TestCanInvoke_publicAllowsAnyone — public functions bypass the +// authorization check entirely (auth middleware lets unauthenticated +// requests through to the handler too). +func TestCanInvoke_publicAllowsAnyone(t *testing.T) { + inv := newInvokerForCanInvokeTest(&Function{ + Namespace: "anchat-test", + Name: "username-check", + IsPublic: true, + }) + for _, wallet := range []string{"", "8oxu7UzzaSXc...", "anchat-test"} { + ok, err := inv.CanInvoke(context.Background(), "anchat-test", "username-check", wallet) + if err != nil { + t.Fatalf("wallet=%q: %v", wallet, err) + } + if !ok { + t.Errorf("wallet=%q: public function denied", wallet) + } + } +} + +// TestCanInvoke_privateRejectsAnonymous — empty callerWallet means the +// auth middleware didn't establish any identity; private functions reject. +func TestCanInvoke_privateRejectsAnonymous(t *testing.T) { + inv := newInvokerForCanInvokeTest(&Function{ + Namespace: "anchat-test", + Name: "user-create", + IsPublic: false, + }) + ok, _ := inv.CanInvoke(context.Background(), "anchat-test", "user-create", "") + if ok { + t.Error("anonymous caller should be denied for private function") + } + // Whitespace-only is still anonymous. + ok, _ = inv.CanInvoke(context.Background(), "anchat-test", "user-create", " ") + if ok { + t.Error("whitespace-only callerWallet should be denied") + } +} + +// TestCanInvoke_privateAllowsJWTAuthenticatedNewWallet is the regression +// guard for bug #215 follow-up. A brand-new wallet (typical signup flow, +// `user-create` style) calls a private function in a namespace it doesn't +// "own" yet. Must succeed: auth middleware already validated the JWT +// belongs to this namespace, the only role of CanInvoke is to confirm +// SOMEONE is authenticated. +// +// Pre-fix this returned false (the wallet wasn't equal to the namespace +// string and wasn't the deployer), which was the entire reason AnChat saw +// 401 unauthorized after the cluster_secret_path fix unblocked JWT +// verification. +func TestCanInvoke_privateAllowsJWTAuthenticatedNewWallet(t *testing.T) { + inv := newInvokerForCanInvokeTest(&Function{ + Namespace: "anchat-test", + Name: "user-create", + IsPublic: false, + CreatedBy: "0xdeployer-wallet-not-this-caller", + }) + const newUserWallet = "8oxu7UzzaSXcxZ9B3YuEqr3Qpmx7tgT9HzaA4NUGiand" + ok, err := inv.CanInvoke(context.Background(), "anchat-test", "user-create", newUserWallet) + if err != nil { + t.Fatalf("CanInvoke: %v", err) + } + if !ok { + t.Fatal("new user wallet should be allowed to invoke private function (auth middleware vouches for them)") + } +} + +// TestCanInvoke_privateAllowsAPIKeyCaller — API-key callers get +// callerWallet=namespace from the wallet resolver. They should still +// succeed; this preserves the pre-#215 working flow for tenants who +// only use API keys. +func TestCanInvoke_privateAllowsAPIKeyCaller(t *testing.T) { + inv := newInvokerForCanInvokeTest(&Function{ + Namespace: "anchat-test", + Name: "user-create", + IsPublic: false, + CreatedBy: "0xdeployer-wallet", + }) + ok, err := inv.CanInvoke(context.Background(), "anchat-test", "user-create", "anchat-test") + if err != nil { + t.Fatalf("CanInvoke: %v", err) + } + if !ok { + t.Error("API-key callers (callerWallet=namespace) must keep working") + } +} + +// TestCanInvoke_privateAllowsDeployer — the function deployer can invoke +// their own private function. Was true pre-fix and must remain true. +func TestCanInvoke_privateAllowsDeployer(t *testing.T) { + const deployer = "0xdeployer-wallet" + inv := newInvokerForCanInvokeTest(&Function{ + Namespace: "anchat-test", + Name: "user-create", + IsPublic: false, + CreatedBy: deployer, + }) + ok, err := inv.CanInvoke(context.Background(), "anchat-test", "user-create", deployer) + if err != nil { + t.Fatalf("CanInvoke: %v", err) + } + if !ok { + t.Error("deployer must always be allowed to invoke their own function") + } +} diff --git a/core/pkg/serverless/triggers/cron_parser_test.go b/core/pkg/serverless/triggers/cron_parser_test.go new file mode 100644 index 0000000..2c50b4e --- /dev/null +++ b/core/pkg/serverless/triggers/cron_parser_test.go @@ -0,0 +1,217 @@ +package triggers + +import ( + "strings" + "testing" + "time" +) + +// ---------------------------------------------------------------------------- +// ParseCron — accept paths +// ---------------------------------------------------------------------------- + +func TestParseCron_fiveField(t *testing.T) { + c, err := ParseCron("0 3 * * *") // every day at 03:00 UTC + if err != nil { + t.Fatalf("ParseCron: %v", err) + } + if c.hasSeconds { + t.Error("hasSeconds = true, want false for 5-field expression") + } + if !c.minutes.match(0) { + t.Error("minutes mask missing 0") + } + if c.minutes.match(1) { + t.Error("minutes mask should not match 1") + } + if !c.hours.match(3) { + t.Error("hours mask missing 3") + } +} + +func TestParseCron_sixField(t *testing.T) { + c, err := ParseCron("*/30 * * * * *") // every 30 seconds + if err != nil { + t.Fatalf("ParseCron: %v", err) + } + if !c.hasSeconds { + t.Error("hasSeconds = false, want true for 6-field expression") + } + if !c.seconds.match(0) || !c.seconds.match(30) { + t.Error("seconds mask should match 0 and 30") + } + if c.seconds.match(15) { + t.Error("seconds mask should NOT match 15 for */30") + } +} + +func TestParseCron_listsAndRanges(t *testing.T) { + c, err := ParseCron("0,15,30,45 9-17 * * 1-5") // every 15min during business hours, weekdays + if err != nil { + t.Fatalf("ParseCron: %v", err) + } + for _, m := range []int{0, 15, 30, 45} { + if !c.minutes.match(m) { + t.Errorf("minutes mask missing %d", m) + } + } + for _, h := range []int{9, 12, 17} { + if !c.hours.match(h) { + t.Errorf("hours mask missing %d", h) + } + } + if c.hours.match(8) || c.hours.match(18) { + t.Error("hours mask should not match outside 9-17") + } + for _, d := range []int{1, 2, 3, 4, 5} { + if !c.dow.match(d) { + t.Errorf("dow mask missing %d", d) + } + } + if c.dow.match(0) || c.dow.match(6) { + t.Error("dow mask should not match weekend (0 or 6)") + } +} + +func TestParseCron_sundayNormalisation(t *testing.T) { + // Cron permits 0 OR 7 for Sunday. Both should be normalised to 0. + c7, err := ParseCron("0 0 * * 7") + if err != nil { + t.Fatalf("ParseCron 7: %v", err) + } + if !c7.dow.match(0) { + t.Error("dow 7 should normalise to match 0 (Sunday)") + } + if c7.dow.match(7) { + t.Error("after normalisation, bit 7 should be cleared (clamped to 0..6)") + } +} + +// ---------------------------------------------------------------------------- +// ParseCron — reject paths +// ---------------------------------------------------------------------------- + +func TestParseCron_rejectMalformed(t *testing.T) { + cases := []struct { + expr string + reason string + }{ + {"", "empty"}, + {" ", "whitespace-only"}, + {"* * *", "too few fields"}, + {"* * * * * * *", "too many fields"}, + {"60 * * * *", "minute out of range"}, + {"* 24 * * *", "hour out of range"}, + {"* * 0 * *", "day-of-month under range"}, + {"* * 32 * *", "day-of-month over range"}, + {"* * * 13 *", "month over range"}, + {"* * * 0 *", "month under range"}, + {"* * * * 8", "day-of-week over range"}, + {"abc * * * *", "non-numeric"}, + {"5-2 * * * *", "inverted range"}, + {"*/0 * * * *", "zero step"}, + {"*/-1 * * * *", "negative step"}, + } + for _, tc := range cases { + t.Run(tc.reason, func(t *testing.T) { + if _, err := ParseCron(tc.expr); err == nil { + t.Errorf("ParseCron(%q) succeeded; expected error (%s)", tc.expr, tc.reason) + } + }) + } +} + +// ---------------------------------------------------------------------------- +// Next() — correctness +// ---------------------------------------------------------------------------- + +func TestNext_dailyAt03UTC(t *testing.T) { + c, _ := ParseCron("0 3 * * *") + now := time.Date(2025, 5, 7, 12, 30, 0, 0, time.UTC) + got, err := c.Next(now) + if err != nil { + t.Fatalf("Next: %v", err) + } + want := time.Date(2025, 5, 8, 3, 0, 0, 0, time.UTC) + if !got.Equal(want) { + t.Errorf("Next = %s, want %s", got, want) + } +} + +func TestNext_strictlyAfter(t *testing.T) { + // When `after` is exactly on a matching minute, Next must return the + // FOLLOWING match, not `after` itself. + c, _ := ParseCron("0 3 * * *") + now := time.Date(2025, 5, 7, 3, 0, 0, 0, time.UTC) // exactly 03:00 + got, _ := c.Next(now) + want := time.Date(2025, 5, 8, 3, 0, 0, 0, time.UTC) + if !got.Equal(want) { + t.Errorf("Next from exact-match should advance one day: got %s, want %s", got, want) + } +} + +func TestNext_secondsResolution(t *testing.T) { + c, _ := ParseCron("*/30 * * * * *") // every 30 sec + now := time.Date(2025, 5, 7, 12, 0, 5, 0, time.UTC) + got, _ := c.Next(now) + want := time.Date(2025, 5, 7, 12, 0, 30, 0, time.UTC) + if !got.Equal(want) { + t.Errorf("Next = %s, want %s", got, want) + } +} + +func TestNext_monthSkip(t *testing.T) { + // Day-of-month=29, only February — must skip to the next leap year. + c, _ := ParseCron("0 0 29 2 *") + now := time.Date(2025, 1, 1, 0, 0, 0, 0, time.UTC) + got, err := c.Next(now) + if err != nil { + t.Fatalf("Next: %v", err) + } + want := time.Date(2028, 2, 29, 0, 0, 0, 0, time.UTC) + if !got.Equal(want) { + t.Errorf("Next leap = %s, want %s", got, want) + } +} + +func TestNext_impossibleScheduleFails(t *testing.T) { + // February 30th never exists. + c, _ := ParseCron("0 0 30 2 *") + now := time.Date(2025, 1, 1, 0, 0, 0, 0, time.UTC) + _, err := c.Next(now) + if err == nil { + t.Fatal("Next should fail for impossible date combo (Feb 30)") + } + if !strings.Contains(err.Error(), "no match") { + t.Errorf("error = %v, want 'no match' substring", err) + } +} + +func TestNext_weekdayOnly(t *testing.T) { + c, _ := ParseCron("0 9 * * 1") // every Monday at 09:00 + // Wednesday May 7, 2025 + now := time.Date(2025, 5, 7, 12, 0, 0, 0, time.UTC) + got, _ := c.Next(now) + // Next Monday is May 12, 2025 + want := time.Date(2025, 5, 12, 9, 0, 0, 0, time.UTC) + if !got.Equal(want) { + t.Errorf("Next weekday = %s, want %s", got, want) + } + if got.Weekday() != time.Monday { + t.Errorf("expected Monday, got %s", got.Weekday()) + } +} + +// ---------------------------------------------------------------------------- +// fieldMatcher +// ---------------------------------------------------------------------------- + +func TestFieldMatcher_outOfRange(t *testing.T) { + var m fieldMatcher = 0xFFFF + if m.match(-1) { + t.Error("match(-1) should be false") + } + if m.match(64) { + t.Error("match(64) should be false (mask is uint64, bits 0..63 only)") + } +} diff --git a/core/pkg/serverless/triggers/cron_scheduler.go b/core/pkg/serverless/triggers/cron_scheduler.go index d0edbfa..6ab4e78 100644 --- a/core/pkg/serverless/triggers/cron_scheduler.go +++ b/core/pkg/serverless/triggers/cron_scheduler.go @@ -128,17 +128,38 @@ func (s *CronScheduler) dispatch(ctx context.Context, row CronDueRow, now time.T zap.String("expr", row.CronExpression), zap.Error(err)) // Push next_run_at far out so we don't keep looking at this row. - _ = s.store.MarkRun(ctx, row.TriggerID, now, now.Add(365*24*time.Hour), + // Best-effort; if the lease is lost (ErrAlreadyRan) another gateway + // already saw the same bad expression and is handling it. + _ = s.store.MarkRun(ctx, row.TriggerID, row.NextRunAt, now, now.Add(365*24*time.Hour), "error", "bad cron expression: "+err.Error()) return } next, err := parsed.Next(now) if err != nil { - _ = s.store.MarkRun(ctx, row.TriggerID, now, now.Add(365*24*time.Hour), + _ = s.store.MarkRun(ctx, row.TriggerID, row.NextRunAt, now, now.Add(365*24*time.Hour), "error", "no next match: "+err.Error()) return } + // Claim the lease BEFORE invoking. The compare-and-swap on next_run_at + // ensures only one gateway wins the race when multiple instances see + // the same trigger as due. Pre-emptive claim (advance to the new next + // + provisional "running" status) means we don't issue duplicate + // invokes; the loser bails here. We patch in the real status after the + // invoke completes via a follow-up unconditional UPDATE keyed on the + // new next_run_at — the loser can't accidentally clobber that because + // its expected-old next_run_at no longer matches anything. + if err := s.store.MarkRun(ctx, row.TriggerID, row.NextRunAt, now, next, "running", ""); err != nil { + if err == ErrAlreadyRan { + // Another gateway claimed this tick. Skip silently. + return + } + s.logger.Warn("cron scheduler: failed to claim lease", + zap.String("trigger_id", row.TriggerID), + zap.Error(err)) + return + } + req := &serverless.InvokeRequest{ Namespace: row.Namespace, FunctionName: row.FunctionName, @@ -160,8 +181,12 @@ func (s *CronScheduler) dispatch(ctx context.Context, row CronDueRow, now time.T status = "error" errMsg = resp.Error } - if err := s.store.MarkRun(ctx, row.TriggerID, now, next, status, errMsg); err != nil { - s.logger.Warn("cron scheduler: MarkRun failed", + // Patch the result over the provisional "running" row. We're CAS'd on + // the (now) post-claim next_run_at so a stale loser from another node + // can't overwrite — though by construction, only this gateway holds + // the lease at this point. + if err := s.store.MarkRun(ctx, row.TriggerID, next, now, next, status, errMsg); err != nil && err != ErrAlreadyRan { + s.logger.Warn("cron scheduler: failed to record run result", zap.String("trigger_id", row.TriggerID), zap.Error(err)) } diff --git a/core/pkg/serverless/triggers/cron_store.go b/core/pkg/serverless/triggers/cron_store.go index f62e7a9..77116c4 100644 --- a/core/pkg/serverless/triggers/cron_store.go +++ b/core/pkg/serverless/triggers/cron_store.go @@ -43,6 +43,12 @@ type cronRow struct { // CronDueRow is the JOINed row returned by ListDue: trigger + the function // metadata the scheduler needs to invoke it. +// +// NextRunAt is captured at ListDue time and passed back into MarkRun as the +// expected-old value for the compare-and-swap UPDATE. Two gateways racing +// the same trigger will both see the same NextRunAt; only the first MarkRun +// wins (1 row affected), the loser gets 0 rows affected and the scheduler +// skips the duplicate invoke. type CronDueRow struct { TriggerID string FunctionID string @@ -156,6 +162,11 @@ func (s *CronTriggerStore) ListDue(ctx context.Context, now time.Time, limit int if limit <= 0 { limit = 100 } + // `next_run_at IS NOT NULL` is defensive: the schema permits NULL and + // `Add` always populates it, but a manual DB poke or future migration + // could leave a NULL row that would otherwise sort first and behave + // unpredictably under `<=`. Guarding here keeps the scheduler's + // invariants explicit. query := ` SELECT t.id AS trigger_id, t.function_id AS function_id, f.name AS function_name, f.namespace AS namespace, @@ -164,6 +175,7 @@ func (s *CronTriggerStore) ListDue(ctx context.Context, now time.Time, limit int JOIN functions f ON t.function_id = f.id WHERE t.enabled = TRUE AND f.status = 'active' + AND t.next_run_at IS NOT NULL AND t.next_run_at <= ? ORDER BY t.next_run_at ASC LIMIT ? @@ -178,9 +190,20 @@ func (s *CronTriggerStore) ListDue(ctx context.Context, now time.Time, limit int // MarkRun updates next_run_at + last_run_at + last_status / last_error // after the scheduler invokes a trigger. Idempotent: if the row was // removed concurrently, the UPDATE is a no-op. +// +// expectedNextRunAt enables the soft-lease semantics promised by the +// scheduler comment: the UPDATE is gated on the row's current next_run_at +// matching what ListDue read. Two gateways racing the same tick will both +// see the same expectedNextRunAt; only one's UPDATE matches a row, the +// other gets RowsAffected==0 and ErrAlreadyRan back so the scheduler can +// skip the duplicate invoke. +// +// Returns ErrAlreadyRan when the compare-and-swap missed (another node won +// the race or the row was deleted). Other errors are wrapped DB errors. func (s *CronTriggerStore) MarkRun( ctx context.Context, triggerID string, + expectedNextRunAt time.Time, ranAt time.Time, nextRunAt time.Time, status string, @@ -189,8 +212,20 @@ func (s *CronTriggerStore) MarkRun( query := ` UPDATE function_cron_triggers SET last_run_at = ?, next_run_at = ?, last_status = ?, last_error = ? - WHERE id = ? + WHERE id = ? AND next_run_at = ? ` - _, err := s.db.Exec(ctx, query, ranAt, nextRunAt, status, errMsg, triggerID) - return err + res, err := s.db.Exec(ctx, query, ranAt, nextRunAt, status, errMsg, triggerID, expectedNextRunAt) + if err != nil { + return fmt.Errorf("failed to MarkRun cron trigger: %w", err) + } + if affected, _ := res.RowsAffected(); affected == 0 { + return ErrAlreadyRan + } + return nil } + +// ErrAlreadyRan is returned by MarkRun when the compare-and-swap on +// next_run_at missed — either another gateway claimed this tick first, or +// the trigger was deleted concurrently. The scheduler treats it as "skip +// the duplicate invoke result" rather than a hard error. +var ErrAlreadyRan = fmt.Errorf("cron trigger already advanced by another node") diff --git a/core/pkg/serverless/triggers/cron_store_markrun_test.go b/core/pkg/serverless/triggers/cron_store_markrun_test.go new file mode 100644 index 0000000..a465c5e --- /dev/null +++ b/core/pkg/serverless/triggers/cron_store_markrun_test.go @@ -0,0 +1,120 @@ +package triggers + +import ( + "context" + "database/sql" + "errors" + "testing" + "time" + + "github.com/DeBrosOfficial/network/pkg/rqlite" + "go.uber.org/zap" +) + +// markRunMockDB is a minimal rqlite.Client surface for MarkRun tests. +// Only Exec is exercised — anything else panics so accidental drift is loud. +type markRunMockDB struct { + rqlite.Client // embedded interface — calling any unimplemented method panics + + lastQuery string + lastArgs []any + rows int64 + execErr error +} + +func (m *markRunMockDB) Exec(_ context.Context, query string, args ...any) (sql.Result, error) { + m.lastQuery = query + m.lastArgs = args + if m.execErr != nil { + return nil, m.execErr + } + return &fakeResult{rows: m.rows}, nil +} + +type fakeResult struct { + rows int64 +} + +func (f *fakeResult) LastInsertId() (int64, error) { return 0, nil } +func (f *fakeResult) RowsAffected() (int64, error) { return f.rows, nil } + +// TestMarkRun_compareAndSwapMisses guards bug #65 audit B6: when two gateways +// race the same tick, only the first MarkRun wins and the second must return +// ErrAlreadyRan so the caller can skip the duplicate invoke. +func TestMarkRun_compareAndSwapMisses(t *testing.T) { + db := &markRunMockDB{rows: 0} // simulate "no row matched the WHERE" + store := NewCronTriggerStore(db, zap.NewNop()) + + expected := time.Date(2025, 5, 7, 12, 0, 0, 0, time.UTC) + err := store.MarkRun(context.Background(), + "trigger-id", expected, expected.Add(time.Second), + expected.Add(time.Minute), "ok", "") + if !errors.Is(err, ErrAlreadyRan) { + t.Fatalf("MarkRun = %v, want ErrAlreadyRan", err) + } +} + +// TestMarkRun_compareAndSwapWins covers the happy path: 1 row affected, no +// error returned. This pairs with the misses-test to fully cover the lease +// branch. +func TestMarkRun_compareAndSwapWins(t *testing.T) { + db := &markRunMockDB{rows: 1} + store := NewCronTriggerStore(db, zap.NewNop()) + + expected := time.Date(2025, 5, 7, 12, 0, 0, 0, time.UTC) + err := store.MarkRun(context.Background(), + "trigger-id", expected, expected.Add(time.Second), + expected.Add(time.Minute), "ok", "") + if err != nil { + t.Fatalf("MarkRun (winning CAS) = %v, want nil", err) + } +} + +// TestMarkRun_dbErrorPropagates ensures a real database failure is wrapped +// and returned, NOT collapsed into ErrAlreadyRan. +func TestMarkRun_dbErrorPropagates(t *testing.T) { + dbErr := errors.New("rqlite unavailable") + db := &markRunMockDB{execErr: dbErr} + store := NewCronTriggerStore(db, zap.NewNop()) + + err := store.MarkRun(context.Background(), + "trigger-id", time.Now(), time.Now(), time.Now(), "ok", "") + if err == nil { + t.Fatal("MarkRun with DB error returned nil") + } + if errors.Is(err, ErrAlreadyRan) { + t.Fatal("DB error should not surface as ErrAlreadyRan") + } + if !errors.Is(err, dbErr) { + t.Errorf("error chain missing wrapped DB error: %v", err) + } +} + +// TestMarkRun_compareAndSwapClauseInQuery verifies the SQL actually contains +// the `next_run_at = ?` predicate — without it the lease semantics are +// silently broken even though tests with a 1-row mock would still "pass". +func TestMarkRun_compareAndSwapClauseInQuery(t *testing.T) { + db := &markRunMockDB{rows: 1} + store := NewCronTriggerStore(db, zap.NewNop()) + + _ = store.MarkRun(context.Background(), "id", + time.Now(), time.Now(), time.Now(), "ok", "") + + if db.lastQuery == "" { + t.Fatal("Exec was not called") + } + // The exact SQL is whitespace-formatted; just check the lease clause is + // present. + if !contains(db.lastQuery, "next_run_at = ?") { + t.Errorf("MarkRun SQL missing CAS clause:\n%s", db.lastQuery) + } +} + +func contains(haystack, needle string) bool { + for i := 0; i+len(needle) <= len(haystack); i++ { + if haystack[i:i+len(needle)] == needle { + return true + } + } + return false +}