From f41242538ee75dfda4c6e435a54f5c814d5ab4d8 Mon Sep 17 00:00:00 2001 From: anonpenguin23 Date: Tue, 9 Jun 2026 13:01:02 +0300 Subject: [PATCH] feat(serverless): add raw http response mode and secrets encryption - Add `raw_http_response` configuration to functions to allow verbatim HTTP responses - Implement cluster-wide secrets encryption key generation and distribution for serverless functions - Update documentation with UnifiedPush support for ntfy on Android/GrapheneOS --- core/docs/PUSH_NOTIFICATIONS.md | 37 ++ core/migrations/029_raw_http_response.sql | 15 + core/pkg/cli/functions/helpers.go | 8 + core/pkg/cli/functions/helpers_test.go | 53 +++ .../cli/production/install/orchestrator.go | 8 + core/pkg/environments/production/config.go | 61 +++ .../environments/production/orchestrator.go | 6 + .../production/secrets_encryption_key_test.go | 80 ++++ core/pkg/environments/templates/node.yaml | 6 + core/pkg/environments/templates/render.go | 9 + .../pkg/environments/templates/render_test.go | 26 ++ core/pkg/gateway/config.go | 8 + core/pkg/gateway/dependencies.go | 12 +- core/pkg/gateway/handlers/join/handler.go | 11 + .../gateway/handlers/push/config_handler.go | 13 +- .../handlers/push/resolve_caller_test.go | 63 ++++ core/pkg/gateway/handlers/push/types.go | 20 +- .../handlers/serverless/invoke_handler.go | 50 +++ .../serverless/raw_http_headers_test.go | 31 ++ core/pkg/node/gateway.go | 10 + core/pkg/push/manager.go | 12 +- core/pkg/push/providers/ntfy/credentials.go | 25 +- .../push/providers/ntfy/credentials_test.go | 20 +- core/pkg/push/providers/ntfy/ntfy.go | 66 +++- core/pkg/push/providers/ntfy/ntfy_test.go | 103 +++++ core/pkg/push/url_guard.go | 193 ++++++++++ core/pkg/push/url_guard_test.go | 160 ++++++++ core/pkg/serverless/engine.go | 124 +++++- .../serverless/ephemeral_disconnect_test.go | 52 +++ core/pkg/serverless/ephemeral_state.go | 352 ++++++++++++++++++ core/pkg/serverless/ephemeral_state_test.go | 295 +++++++++++++++ 
core/pkg/serverless/execution/executor.go | 11 +- .../serverless/execution/randsource_test.go | 181 +++++++++ core/pkg/serverless/hostfuncs_test.go | 12 + .../serverless/hostfunctions/host_services.go | 27 +- core/pkg/serverless/hostfunctions/http.go | 12 + core/pkg/serverless/hostfunctions/pubsub.go | 34 ++ core/pkg/serverless/hostfunctions/secrets.go | 32 +- .../serverless/hostfunctions/secrets_test.go | 199 ++++++++++ core/pkg/serverless/hostfunctions/types.go | 7 + core/pkg/serverless/invoke.go | 9 + core/pkg/serverless/mocks_test.go | 12 + core/pkg/serverless/raw_http.go | 142 +++++++ core/pkg/serverless/raw_http_test.go | 129 +++++++ core/pkg/serverless/registry.go | 25 +- .../pkg/serverless/registry/function_store.go | 19 +- core/pkg/serverless/registry/types.go | 7 + core/pkg/serverless/registry_raw_http_test.go | 34 ++ .../dispatch_local_dedup_integration_test.go | 159 ++++++++ core/pkg/serverless/triggers/dispatcher.go | 65 +++- core/pkg/serverless/triggers/local_dedup.go | 108 ++++++ .../serverless/triggers/local_dedup_test.go | 140 +++++++ core/pkg/serverless/types.go | 40 ++ core/pkg/serverless/websocket.go | 30 ++ 54 files changed, 3310 insertions(+), 53 deletions(-) create mode 100644 core/migrations/029_raw_http_response.sql create mode 100644 core/pkg/cli/functions/helpers_test.go create mode 100644 core/pkg/environments/production/secrets_encryption_key_test.go create mode 100644 core/pkg/gateway/handlers/push/resolve_caller_test.go create mode 100644 core/pkg/gateway/handlers/serverless/raw_http_headers_test.go create mode 100644 core/pkg/push/url_guard.go create mode 100644 core/pkg/push/url_guard_test.go create mode 100644 core/pkg/serverless/ephemeral_disconnect_test.go create mode 100644 core/pkg/serverless/ephemeral_state.go create mode 100644 core/pkg/serverless/ephemeral_state_test.go create mode 100644 core/pkg/serverless/execution/randsource_test.go create mode 100644 core/pkg/serverless/hostfunctions/secrets_test.go create mode 100644 
core/pkg/serverless/raw_http.go create mode 100644 core/pkg/serverless/raw_http_test.go create mode 100644 core/pkg/serverless/registry_raw_http_test.go create mode 100644 core/pkg/serverless/triggers/dispatch_local_dedup_integration_test.go create mode 100644 core/pkg/serverless/triggers/local_dedup.go create mode 100644 core/pkg/serverless/triggers/local_dedup_test.go diff --git a/core/docs/PUSH_NOTIFICATIONS.md b/core/docs/PUSH_NOTIFICATIONS.md index 8e22c39..54410e8 100644 --- a/core/docs/PUSH_NOTIFICATIONS.md +++ b/core/docs/PUSH_NOTIFICATIONS.md @@ -214,6 +214,43 @@ your client computes locally from `(namespace, userId, topic_secret)`. For `ntfy` with `topic_mode=path`, the token is `ns//`. +### UnifiedPush (Android / GrapheneOS, no Google Play Services) + +ntfy is a [UnifiedPush](https://unifiedpush.org) distributor, so Android +devices — including de-Googled **GrapheneOS** — can receive push **without +Firebase / Google Play Services**. The flow: + +1. The device runs a UnifiedPush **distributor** (the ntfy Android app, or an + embedded distributor library) pointed at your push host + (`https://push.<your-domain>`). +2. The app registers with the distributor and is handed an **endpoint URL**, + e.g. `https://push.<your-domain>/upXXXXXXXX`. +3. Register that endpoint as a push device: + + ```http + POST /v1/push/devices + { + "device_id": "<device-id>", + "provider": "ntfy", + "token": "https://push.<your-domain>/upXXXXXXXX", // the full endpoint + "platform": "android" + } + ``` + +The gateway POSTs to the endpoint **verbatim** (per the UnifiedPush spec), so +you don't have to deconstruct it. As a safety measure the endpoint's +scheme+host **must match your configured ntfy push host** — a device token can +only ever publish to your own push server, never an arbitrary host. + +You may instead register just the bare **topic** (the endpoint's last path +segment) as the token — both forms work; use whichever your UnifiedPush library +makes convenient. 
+ +**GrapheneOS notes:** works under both "No Google Play" and "Sandboxed Google +Play" profiles. The distributor holds the persistent connection (not your app), +so battery impact is the distributor's; high-priority messages +(`priority: "high"`) wake the app from Doze. + --- ## Step 6 — Send pushes diff --git a/core/migrations/029_raw_http_response.sql b/core/migrations/029_raw_http_response.sql new file mode 100644 index 0000000..1ee11e6 --- /dev/null +++ b/core/migrations/029_raw_http_response.sql @@ -0,0 +1,15 @@ +-- ============================================================================= +-- 029_raw_http_response.sql +-- +-- Raw-HTTP-response serverless function mode — bugboard #835. +-- +-- When raw_http_response is true, the function may call the set_http_response +-- host function to emit a verbatim HTTP response (status + headers + body) +-- instead of the JSON/Ack-wrapped output. This lets a namespace app proxy an +-- upstream RPC (Helius / Alchemy) transparently. See pkg/serverless/raw_http.go. +-- +-- Default false → backward compatible: existing functions keep returning the +-- JSON/Ack-wrapped output unchanged. +-- ============================================================================= + +ALTER TABLE functions ADD COLUMN raw_http_response BOOLEAN DEFAULT FALSE; diff --git a/core/pkg/cli/functions/helpers.go b/core/pkg/cli/functions/helpers.go index 41a2b79..b9ca945 100644 --- a/core/pkg/cli/functions/helpers.go +++ b/core/pkg/cli/functions/helpers.go @@ -32,6 +32,11 @@ type FunctionConfig struct { WSIdleTimeoutSec int `yaml:"ws_idle_timeout_sec"` WSMaxFrameBytes int `yaml:"ws_max_frame_bytes"` WSMaxInflightPerConn int `yaml:"ws_max_inflight_per_conn"` + + // RawHTTPResponse enables raw-HTTP-response mode (bugboard #835) — the + // function may call set_http_response to emit a verbatim HTTP response + // (status/headers/body) instead of the JSON/Ack-wrapped output. 
+ RawHTTPResponse bool `yaml:"raw_http_response"` } // RetryConfig holds retry settings. @@ -226,6 +231,9 @@ func uploadWASMFunction(wasmPath string, cfg *FunctionConfig) (map[string]interf if cfg.WSMaxInflightPerConn > 0 { metaObj["ws_max_inflight_per_conn"] = cfg.WSMaxInflightPerConn } + if cfg.RawHTTPResponse { + metaObj["raw_http_response"] = true + } if len(metaObj) > 0 { metadata, _ := json.Marshal(metaObj) writer.WriteField("metadata", string(metadata)) diff --git a/core/pkg/cli/functions/helpers_test.go b/core/pkg/cli/functions/helpers_test.go new file mode 100644 index 0000000..9715c14 --- /dev/null +++ b/core/pkg/cli/functions/helpers_test.go @@ -0,0 +1,53 @@ +package functions + +import ( + "os" + "path/filepath" + "testing" +) + +// writeFunctionYAML writes a function.yaml into a fresh temp dir and returns it. +func writeFunctionYAML(t *testing.T, body string) string { + t.Helper() + dir := t.TempDir() + if err := os.WriteFile(filepath.Join(dir, "function.yaml"), []byte(body), 0o600); err != nil { + t.Fatalf("write function.yaml: %v", err) + } + return dir +} + +func TestLoadConfig_RawHTTPResponse_true(t *testing.T) { + dir := writeFunctionYAML(t, "name: rpc-proxy\nraw_http_response: true\n") + + cfg, err := LoadConfig(dir) + if err != nil { + t.Fatalf("LoadConfig: %v", err) + } + if !cfg.RawHTTPResponse { + t.Error("RawHTTPResponse = false, want true") + } +} + +func TestLoadConfig_RawHTTPResponse_defaultsFalse(t *testing.T) { + dir := writeFunctionYAML(t, "name: plain-fn\n") + + cfg, err := LoadConfig(dir) + if err != nil { + t.Fatalf("LoadConfig: %v", err) + } + if cfg.RawHTTPResponse { + t.Error("RawHTTPResponse = true, want false (omitted in yaml)") + } +} + +func TestLoadConfig_RawHTTPResponse_explicitFalse(t *testing.T) { + dir := writeFunctionYAML(t, "name: plain-fn\nraw_http_response: false\n") + + cfg, err := LoadConfig(dir) + if err != nil { + t.Fatalf("LoadConfig: %v", err) + } + if cfg.RawHTTPResponse { + t.Error("RawHTTPResponse = true, 
want false") + } +} diff --git a/core/pkg/cli/production/install/orchestrator.go b/core/pkg/cli/production/install/orchestrator.go index 58f0f0d..4a94be2 100644 --- a/core/pkg/cli/production/install/orchestrator.go +++ b/core/pkg/cli/production/install/orchestrator.go @@ -477,6 +477,14 @@ func (o *Orchestrator) saveSecretsFromJoinResponse(resp *joinhandlers.JoinRespon } } + // Write serverless secrets encryption key (bugboard #837) — identical on + // every node so namespace function secrets decrypt cluster-wide. + if resp.SecretsEncryptionKey != "" { + if err := os.WriteFile(filepath.Join(secretsDir, "secrets-encryption-key"), []byte(resp.SecretsEncryptionKey), 0600); err != nil { + return fmt.Errorf("failed to write secrets-encryption-key: %w", err) + } + } + // Write IPFS Cluster trusted peer IDs if len(resp.IPFSClusterPeerIDs) > 0 { content := strings.Join(resp.IPFSClusterPeerIDs, "\n") + "\n" diff --git a/core/pkg/environments/production/config.go b/core/pkg/environments/production/config.go index 2eaa530..085555c 100644 --- a/core/pkg/environments/production/config.go +++ b/core/pkg/environments/production/config.go @@ -200,6 +200,18 @@ func (cg *ConfigGenerator) GenerateNodeConfig(peerAddresses []string, vpsIP stri data.Environment = cg.Environment data.OperatorWallet = cg.OperatorWallet + // Serverless function secrets encryption key (bugboard #837). Read the + // persisted key (generated in Phase3 / received via join) so it is + // rendered into node.yaml under http_gateway. If the file is missing the + // key is left empty and omitted from the rendered config — get_secret then + // stays disabled until the operator provisions the key. We deliberately do + // NOT generate here: generation/distribution is owned by SecretGenerator + // and the join flow so every node in a cluster shares one key. 
+ secretsKeyPath := filepath.Join(cg.oramaDir, "secrets", "secrets-encryption-key") + if keyBytes, err := os.ReadFile(secretsKeyPath); err == nil { + data.SecretsEncryptionKey = strings.TrimSpace(string(keyBytes)) + } + return templates.RenderNodeConfig(data) } @@ -471,6 +483,55 @@ func (sg *SecretGenerator) EnsureAPIKeyHMACSecret() (string, error) { return secret, nil } +// EnsureSecretsEncryptionKey gets or generates the AES-256 key used to +// encrypt serverless function secrets at rest (the function_secrets table). +// The key is a 32-byte random value stored as 64 hex characters. +// +// It MUST be identical on every namespace-gateway node in a cluster and +// stable across restarts — otherwise secrets encrypted by one process can't +// be decrypted by another (bugboard #837). Like api-key-hmac-secret, joining +// nodes receive this value through the join flow rather than generating their +// own; this method only generates on the genesis node (or returns the +// existing key if a joining node already wrote it to disk). 
+func (sg *SecretGenerator) EnsureSecretsEncryptionKey() (string, error) { + secretPath := filepath.Join(sg.oramaDir, "secrets", "secrets-encryption-key") + secretDir := filepath.Dir(secretPath) + + if err := os.MkdirAll(secretDir, 0700); err != nil { + return "", fmt.Errorf("failed to create secrets directory: %w", err) + } + if err := os.Chmod(secretDir, 0700); err != nil { + return "", fmt.Errorf("failed to set secrets directory permissions: %w", err) + } + + // Reuse the existing key only if it decodes to exactly 32 bytes of hex; a 64-char non-hex file is not a usable AES-256 key and is regenerated below + if data, err := os.ReadFile(secretPath); err == nil { + key := strings.TrimSpace(string(data)) + if raw, decErr := hex.DecodeString(key); decErr == nil && len(raw) == 32 { + if err := ensureSecretFilePermissions(secretPath); err != nil { + return "", err + } + return key, nil + } + } + + // Generate new key (32 bytes = 64 hex chars) + keyBytes := make([]byte, 32) + if _, err := rand.Read(keyBytes); err != nil { + return "", fmt.Errorf("failed to generate secrets encryption key: %w", err) + } + key := hex.EncodeToString(keyBytes) + + if err := os.WriteFile(secretPath, []byte(key), 0600); err != nil { + return "", fmt.Errorf("failed to save secrets encryption key: %w", err) + } + if err := ensureSecretFilePermissions(secretPath); err != nil { + return "", err + } + + return key, nil +} + func ensureSecretFilePermissions(secretPath string) error { if err := os.Chmod(secretPath, 0600); err != nil { return fmt.Errorf("failed to set permissions on %s: %w", secretPath, err) diff --git a/core/pkg/environments/production/orchestrator.go b/core/pkg/environments/production/orchestrator.go index 4704ed1..42be586 100644 --- a/core/pkg/environments/production/orchestrator.go +++ b/core/pkg/environments/production/orchestrator.go @@ -593,6 +593,12 @@ func (ps *ProductionSetup) Phase3GenerateSecrets() error { } ps.logf(" ✓ API key HMAC secret ensured") + // Serverless function secrets encryption key (bugboard #837) + if _, err := ps.secretGenerator.EnsureSecretsEncryptionKey(); err != nil { + return fmt.Errorf("failed to ensure secrets encryption 
key: %w", err) + } + ps.logf(" ✓ Secrets encryption key ensured") + // Node identity (unified architecture) peerID, err := ps.secretGenerator.EnsureNodeIdentity() if err != nil { diff --git a/core/pkg/environments/production/secrets_encryption_key_test.go b/core/pkg/environments/production/secrets_encryption_key_test.go new file mode 100644 index 0000000..c4a49be --- /dev/null +++ b/core/pkg/environments/production/secrets_encryption_key_test.go @@ -0,0 +1,80 @@ +package production + +import ( + "encoding/hex" + "os" + "path/filepath" + "strings" + "testing" +) + +// TestEnsureSecretsEncryptionKey_generatesAndPersists verifies that a fresh +// oramaDir produces a valid 32-byte hex key written to disk. +func TestEnsureSecretsEncryptionKey_generatesAndPersists(t *testing.T) { + dir := t.TempDir() + sg := NewSecretGenerator(dir) + + key, err := sg.EnsureSecretsEncryptionKey() + if err != nil { + t.Fatalf("EnsureSecretsEncryptionKey failed: %v", err) + } + if len(key) != 64 { + t.Fatalf("expected 64 hex chars, got %d (%q)", len(key), key) + } + raw, err := hex.DecodeString(key) + if err != nil || len(raw) != 32 { + t.Fatalf("key is not 32 bytes hex: err=%v len=%d", err, len(raw)) + } + + // Persisted to the expected path. + data, err := os.ReadFile(filepath.Join(dir, "secrets", "secrets-encryption-key")) + if err != nil { + t.Fatalf("reading persisted key failed: %v", err) + } + if strings.TrimSpace(string(data)) != key { + t.Errorf("persisted key %q != returned key %q", strings.TrimSpace(string(data)), key) + } +} + +// TestEnsureSecretsEncryptionKey_idempotent verifies the key is stable across +// calls — this is the property that makes secrets survive restarts and stay +// identical across cluster nodes (bugboard #837). 
+func TestEnsureSecretsEncryptionKey_idempotent(t *testing.T) { + dir := t.TempDir() + sg := NewSecretGenerator(dir) + + first, err := sg.EnsureSecretsEncryptionKey() + if err != nil { + t.Fatalf("first call failed: %v", err) + } + second, err := sg.EnsureSecretsEncryptionKey() + if err != nil { + t.Fatalf("second call failed: %v", err) + } + if first != second { + t.Errorf("key changed between calls: %q != %q", first, second) + } +} + +// TestEnsureSecretsEncryptionKey_regeneratesInvalid verifies a corrupt/empty +// on-disk key (wrong length) is replaced with a fresh valid one. +func TestEnsureSecretsEncryptionKey_regeneratesInvalid(t *testing.T) { + dir := t.TempDir() + secretsDir := filepath.Join(dir, "secrets") + if err := os.MkdirAll(secretsDir, 0700); err != nil { + t.Fatalf("mkdir failed: %v", err) + } + keyPath := filepath.Join(secretsDir, "secrets-encryption-key") + if err := os.WriteFile(keyPath, []byte("too-short"), 0600); err != nil { + t.Fatalf("write failed: %v", err) + } + + sg := NewSecretGenerator(dir) + key, err := sg.EnsureSecretsEncryptionKey() + if err != nil { + t.Fatalf("EnsureSecretsEncryptionKey failed: %v", err) + } + if len(key) != 64 { + t.Errorf("expected regenerated 64-char key, got %d (%q)", len(key), key) + } +} diff --git a/core/pkg/environments/templates/node.yaml b/core/pkg/environments/templates/node.yaml index 8559e0f..4cb1710 100644 --- a/core/pkg/environments/templates/node.yaml +++ b/core/pkg/environments/templates/node.yaml @@ -88,6 +88,12 @@ http_gateway: ipfs_cluster_api_url: "http://localhost:{{.ClusterAPIPort}}" ipfs_api_url: "http://localhost:{{.IPFSAPIPort}}" ipfs_timeout: "60s" +{{- if .SecretsEncryptionKey}} + # Serverless function secrets encryption key (AES-256, hex). Must be + # identical on every namespace-gateway node and stable across restarts + # (bugboard #837). Sourced from ~/.orama/secrets/secrets-encryption-key. 
+ secrets_encryption_key: "{{.SecretsEncryptionKey}}" +{{- end}} # Routes for internal service reverse proxy (kept for backwards compatibility but not used by full gateway) routes: {} diff --git a/core/pkg/environments/templates/render.go b/core/pkg/environments/templates/render.go index 135085e..8258d26 100644 --- a/core/pkg/environments/templates/render.go +++ b/core/pkg/environments/templates/render.go @@ -46,6 +46,15 @@ type NodeConfigData struct { SSHUser string // SSH user for remote management Environment string // Environment name (devnet, testnet, etc.) OperatorWallet string // Operator wallet address + + // SecretsEncryptionKey is the AES-256 key (hex, 64 chars) used to encrypt + // serverless function secrets at rest. Rendered under http_gateway in + // node.yaml. Sourced from ~/.orama/secrets/secrets-encryption-key — must + // be identical across all namespace-gateway nodes in a cluster and stable + // across restarts (bugboard #837). Empty → key omitted from the rendered + // config (the gateway then reads the secret file directly / get_secret + // stays disabled until the key is configured). + SecretsEncryptionKey string } // GatewayConfigData holds parameters for gateway.yaml rendering diff --git a/core/pkg/environments/templates/render_test.go b/core/pkg/environments/templates/render_test.go index 8b84b58..ebf40ba 100644 --- a/core/pkg/environments/templates/render_test.go +++ b/core/pkg/environments/templates/render_test.go @@ -41,6 +41,32 @@ func TestRenderNodeConfig(t *testing.T) { } } +func TestRenderNodeConfig_secretsEncryptionKey(t *testing.T) { + const key = "0123456789abcdef0123456789abcdef0123456789abcdef0123456789abcdef" + + // Happy path: key present → rendered under http_gateway. 
+ withKey, err := RenderNodeConfig(NodeConfigData{ + NodeID: "node1", + SecretsEncryptionKey: key, + }) + if err != nil { + t.Fatalf("RenderNodeConfig failed: %v", err) + } + want := "secrets_encryption_key: \"" + key + "\"" + if !strings.Contains(withKey, want) { + t.Errorf("rendered node config missing secrets key line %q\n---\n%s", want, withKey) + } + + // Edge case: empty key → line omitted entirely (no empty value rendered). + withoutKey, err := RenderNodeConfig(NodeConfigData{NodeID: "node1"}) + if err != nil { + t.Fatalf("RenderNodeConfig failed: %v", err) + } + if strings.Contains(withoutKey, "secrets_encryption_key") { + t.Errorf("empty key should omit secrets_encryption_key line, got:\n%s", withoutKey) + } +} + func TestRenderGatewayConfig(t *testing.T) { bootstrapMultiaddr := "/ip4/127.0.0.1/tcp/4001/p2p/Qm1234567890" data := GatewayConfigData{ diff --git a/core/pkg/gateway/config.go b/core/pkg/gateway/config.go index 0179b40..a646e1a 100644 --- a/core/pkg/gateway/config.go +++ b/core/pkg/gateway/config.go @@ -51,6 +51,14 @@ type Config struct { // Loaded from ~/.orama/secrets/api-key-hmac-secret. APIKeyHMACSecret string + // SecretsEncryptionKey is the AES-256 key (32 bytes, hex-encoded → 64 + // hex chars) used to encrypt serverless function secrets at rest in the + // function_secrets table. It MUST be identical on every namespace-gateway + // node in a cluster and stable across restarts — otherwise secrets + // encrypted by one process cannot be decrypted by another (bugboard #837). + // Loaded from ~/.orama/secrets/secrets-encryption-key. + SecretsEncryptionKey string + // WebRTC configuration (set when namespace has WebRTC enabled). 
// // WebRTCEnabled is RETAINED for back-compat with operator YAML and diff --git a/core/pkg/gateway/dependencies.go b/core/pkg/gateway/dependencies.go index 8c490f8..2f03089 100644 --- a/core/pkg/gateway/dependencies.go +++ b/core/pkg/gateway/dependencies.go @@ -469,9 +469,17 @@ func initializeServerless(logger *logging.ColoredLogger, cfg *Config, deps *Depe engineCfg.MaxTimeoutSeconds = 60 engineCfg.ModuleCacheSize = 100 - // Create secrets manager for serverless functions (AES-256-GCM encrypted) + // Create secrets manager for serverless functions (AES-256-GCM encrypted). + // + // The encryption key comes from the gateway Config (loaded from + // ~/.orama/secrets/secrets-encryption-key), NOT from engineCfg — engineCfg + // never has the key set, so passing it always produced a per-process + // ephemeral key and made get_secret return undecryptable values + // (bugboard #837). allowEphemeral=false: a missing/invalid key fails + // loudly here and disables get_secret rather than silently corrupting + // secrets. 
var secretsMgr serverless.SecretsManager - if smImpl, secretsErr := hostfunctions.NewDBSecretsManager(deps.ORMClient, engineCfg.SecretsEncryptionKey, logger.Logger); secretsErr != nil { + if smImpl, secretsErr := hostfunctions.NewDBSecretsManager(deps.ORMClient, cfg.SecretsEncryptionKey, false, logger.Logger); secretsErr != nil { logger.ComponentWarn(logging.ComponentGeneral, "Failed to initialize secrets manager; get_secret will be unavailable", zap.Error(secretsErr)) } else { diff --git a/core/pkg/gateway/handlers/join/handler.go b/core/pkg/gateway/handlers/join/handler.go index dd79485..b4f3d76 100644 --- a/core/pkg/gateway/handlers/join/handler.go +++ b/core/pkg/gateway/handlers/join/handler.go @@ -39,6 +39,9 @@ type JoinResponse struct { APIKeyHMACSecret string `json:"api_key_hmac_secret,omitempty"` RQLitePassword string `json:"rqlite_password,omitempty"` OlricEncryptionKey string `json:"olric_encryption_key,omitempty"` + // Serverless secrets encryption key (bugboard #837) — must be identical on + // every node so namespace function secrets decrypt cluster-wide. + SecretsEncryptionKey string `json:"secrets_encryption_key,omitempty"` // Cluster join info (all using WG IPs) RQLiteJoinAddress string `json:"rqlite_join_address"` @@ -200,6 +203,13 @@ func (h *Handler) HandleJoin(w http.ResponseWriter, r *http.Request) { olricEncryptionKey = strings.TrimSpace(string(data)) } + // Read serverless secrets encryption key (optional — may not exist on + // older clusters; bugboard #837) + secretsEncryptionKey := "" + if data, err := os.ReadFile(h.oramaDir + "/secrets/secrets-encryption-key"); err == nil { + secretsEncryptionKey = strings.TrimSpace(string(data)) + } + // 7. 
Get this node's WG IP (needed before peer list to check self-inclusion) myWGIP, err := h.getMyWGIP() if err != nil { @@ -271,6 +281,7 @@ func (h *Handler) HandleJoin(w http.ResponseWriter, r *http.Request) { APIKeyHMACSecret: apiKeyHMACSecret, RQLitePassword: rqlitePassword, OlricEncryptionKey: olricEncryptionKey, + SecretsEncryptionKey: secretsEncryptionKey, RQLiteJoinAddress: fmt.Sprintf("%s:7001", myWGIP), IPFSPeer: ipfsPeer, IPFSClusterPeer: ipfsClusterPeer, diff --git a/core/pkg/gateway/handlers/push/config_handler.go b/core/pkg/gateway/handlers/push/config_handler.go index fc23461..a3dacd5 100644 --- a/core/pkg/gateway/handlers/push/config_handler.go +++ b/core/pkg/gateway/handlers/push/config_handler.go @@ -17,7 +17,6 @@ import ( "encoding/json" "errors" "net/http" - "strings" "time" "github.com/DeBrosOfficial/network/pkg/push" @@ -136,13 +135,13 @@ func (h *Handlers) PutConfigHandler(w http.ResponseWriter, r *http.Request) { return } - // Validate URL fields look reasonable. We don't do hostname resolution - // here (slow, flaky); just reject obviously-wrong schemes. + // Reject a base URL that targets an internal/reserved host — a tenant must + // not be able to turn the gateway's push sender into an SSRF proxy (cloud + // metadata, WireGuard mesh, loopback). This is the config-SET path, so the + // DNS-resolving check is fine here; the hot send path never runs it. 
if body.NtfyBaseURL != nil && *body.NtfyBaseURL != "" { - if !strings.HasPrefix(*body.NtfyBaseURL, "http://") && - !strings.HasPrefix(*body.NtfyBaseURL, "https://") { - writeError(w, http.StatusBadRequest, - "ntfy_base_url must start with http:// or https://") + if err := push.CheckBaseURLResolvable(r.Context(), *body.NtfyBaseURL); err != nil { + writeError(w, http.StatusBadRequest, "ntfy_base_url rejected: "+err.Error()) return } } diff --git a/core/pkg/gateway/handlers/push/resolve_caller_test.go b/core/pkg/gateway/handlers/push/resolve_caller_test.go new file mode 100644 index 0000000..548b61a --- /dev/null +++ b/core/pkg/gateway/handlers/push/resolve_caller_test.go @@ -0,0 +1,63 @@ +package push + +import ( + "context" + "net/http" + "net/http/httptest" + "testing" + + authsvc "github.com/DeBrosOfficial/network/pkg/gateway/auth" + "github.com/DeBrosOfficial/network/pkg/gateway/ctxkeys" +) + +// Bugboard #548 — a push device must be keyed on the stable identity (rootId) +// when the app provides one, not the wallet credential that authenticated the +// session. resolveCallerUserID prefers the `root_id` custom claim and falls +// back to the JWT subject so single-credential apps keep working. + +func reqWithClaims(t *testing.T, claims *authsvc.JWTClaims) *http.Request { + t.Helper() + r := httptest.NewRequest(http.MethodGet, "/", nil) + ctx := r.Context() + if claims != nil { + ctx = context.WithValue(ctx, ctxkeys.JWT, claims) + } + return r.WithContext(ctx) +} + +func TestResolveCallerUserID_prefersRootIDClaim(t *testing.T) { + r := reqWithClaims(t, &authsvc.JWTClaims{ + Sub: "0xWALLET", + Custom: map[string]string{rootIDClaim: "root-uuid-123"}, + }) + if got := resolveCallerUserID(r); got != "root-uuid-123" { + t.Errorf("want rootId from claim, got %q", got) + } +} + +func TestResolveCallerUserID_fallsBackToSubject(t *testing.T) { + // No custom claim → wallet subject (back-compat for single-credential apps). 
+ r := reqWithClaims(t, &authsvc.JWTClaims{Sub: "0xWALLET"}) + if got := resolveCallerUserID(r); got != "0xWALLET" { + t.Errorf("want wallet subject fallback, got %q", got) + } +} + +func TestResolveCallerUserID_emptyRootIDFallsBack(t *testing.T) { + // An empty root_id must not collapse identity to "" — fall back to subject. + r := reqWithClaims(t, &authsvc.JWTClaims{ + Sub: "0xWALLET", + Custom: map[string]string{rootIDClaim: ""}, + }) + if got := resolveCallerUserID(r); got != "0xWALLET" { + t.Errorf("want wallet fallback on empty root_id, got %q", got) + } +} + +func TestResolveCallerUserID_noJWTReturnsEmpty(t *testing.T) { + // API-key-only request (no JWT in context) → empty. + r := reqWithClaims(t, nil) + if got := resolveCallerUserID(r); got != "" { + t.Errorf("want empty for API-key-only request, got %q", got) + } +} diff --git a/core/pkg/gateway/handlers/push/types.go b/core/pkg/gateway/handlers/push/types.go index 5fe2af0..9e2b568 100644 --- a/core/pkg/gateway/handlers/push/types.go +++ b/core/pkg/gateway/handlers/push/types.go @@ -141,11 +141,27 @@ func resolveNamespace(r *http.Request) string { return "" } -// resolveCallerUserID extracts the JWT subject (typically the wallet) of -// the caller, or empty if the request was authenticated by API key only. +// rootIDClaim is the custom JWT claim an app may set to carry the stable +// identity (rootId) that a device should be keyed on, independent of which +// wallet credential authenticated the session. See bugboard #548. +const rootIDClaim = "root_id" + +// resolveCallerUserID extracts the identity a push device should be keyed on. +// +// In a multi-credential app (anchat), the JWT subject is the *wallet* — a +// credential, not the identity. A single user (rootId) with N linked wallets +// would otherwise register N device rows and receive N duplicate pushes +// (bugboard #548). 
When the app includes a stable `root_id` custom claim, we +// key on that; otherwise we fall back to the subject (wallet) so single- +// credential apps and older tokens keep working unchanged. +// +// Returns empty if the request was authenticated by API key only (no JWT). func resolveCallerUserID(r *http.Request) string { if v := r.Context().Value(ctxkeys.JWT); v != nil { if claims, ok := v.(*auth.JWTClaims); ok && claims != nil { + if rootID, ok := claims.Custom[rootIDClaim]; ok && rootID != "" { + return rootID + } return claims.Sub } } diff --git a/core/pkg/gateway/handlers/serverless/invoke_handler.go b/core/pkg/gateway/handlers/serverless/invoke_handler.go index f405072..2b96001 100644 --- a/core/pkg/gateway/handlers/serverless/invoke_handler.go +++ b/core/pkg/gateway/handlers/serverless/invoke_handler.go @@ -145,6 +145,27 @@ func (h *ServerlessHandlers) InvokeFunction(w http.ResponseWriter, r *http.Reque w.Header().Set("X-Request-ID", resp.RequestID) w.Header().Set("X-Duration-Ms", strconv.FormatInt(resp.DurationMS, 10)) + // Raw-HTTP-response mode (bugboard #835): when a function deployed with + // raw_http_response actually set a response via set_http_response, replay + // it verbatim (status + headers + body) and skip the sniff/wrap path. If + // the function set nothing, RawHTTP is nil and we fall through to the + // normal behavior unchanged. + if resp.RawHTTP != nil { + for k, v := range resp.RawHTTP.Headers { + // A tenant function must not overwrite gateway-owned trace/auth + // headers or framing-control (hop-by-hop) headers via its raw + // response — that would let it forge request IDs, leak/spoof + // internal-auth headers, or corrupt response framing. 
+ if isReservedResponseHeader(k) { + continue + } + w.Header().Set(k, v) + } + w.WriteHeader(resp.RawHTTP.Status) + w.Write(resp.RawHTTP.Body) + return + } + // Try to detect if output is JSON if len(resp.Output) > 0 && (resp.Output[0] == '{' || resp.Output[0] == '[') { w.Header().Set("Content-Type", "application/json") @@ -256,3 +277,32 @@ func (h *ServerlessHandlers) ListVersions(w http.ResponseWriter, r *http.Request "count": len(versions), }) } + +// reservedResponseHeaders are response headers a raw-HTTP-response tenant +// function (bugboard #835) must not be able to set or overwrite: gateway-owned +// trace/auth headers and hop-by-hop / framing-control headers. Compared +// case-insensitively; the X-Internal- prefix is matched separately. +var reservedResponseHeaders = map[string]struct{}{ + "x-request-id": {}, + "x-duration-ms": {}, + "content-length": {}, + "transfer-encoding": {}, + "connection": {}, + "keep-alive": {}, + "proxy-authenticate": {}, + "proxy-authorization": {}, + "te": {}, + "trailer": {}, + "upgrade": {}, +} + +// isReservedResponseHeader reports whether a tenant-supplied response header key +// is reserved for the gateway and must be ignored in raw-HTTP-response mode. +func isReservedResponseHeader(key string) bool { + k := strings.ToLower(strings.TrimSpace(key)) + if _, ok := reservedResponseHeaders[k]; ok { + return true + } + // Any internal-auth header the gateway uses for inter-service trust. 
+ return strings.HasPrefix(k, "x-internal-") +} diff --git a/core/pkg/gateway/handlers/serverless/raw_http_headers_test.go b/core/pkg/gateway/handlers/serverless/raw_http_headers_test.go new file mode 100644 index 0000000..b7ba382 --- /dev/null +++ b/core/pkg/gateway/handlers/serverless/raw_http_headers_test.go @@ -0,0 +1,31 @@ +package serverless + +import "testing" + +// Bugboard #835 hardening (flagged by code + security review): a raw-HTTP +// tenant function must not be able to set/overwrite gateway-owned trace/auth +// headers or hop-by-hop framing headers. + +func TestIsReservedResponseHeader(t *testing.T) { + reserved := []string{ + "X-Request-ID", "x-request-id", "X-Duration-Ms", + "Content-Length", "Transfer-Encoding", "Connection", "Keep-Alive", + "Proxy-Authenticate", "Proxy-Authorization", "TE", "Trailer", "Upgrade", + "X-Internal-Auth", "x-internal-anything", " X-Request-Id ", + } + for _, h := range reserved { + if !isReservedResponseHeader(h) { + t.Errorf("isReservedResponseHeader(%q) = false; want true (must be protected)", h) + } + } + + allowed := []string{ + "Content-Type", "Cache-Control", "X-Custom", "ETag", + "Access-Control-Allow-Origin", "Location", "Retry-After", + } + for _, h := range allowed { + if isReservedResponseHeader(h) { + t.Errorf("isReservedResponseHeader(%q) = true; want false (tenant may set it)", h) + } + } +} diff --git a/core/pkg/node/gateway.go b/core/pkg/node/gateway.go index 8bdd97d..2615674 100644 --- a/core/pkg/node/gateway.go +++ b/core/pkg/node/gateway.go @@ -58,6 +58,15 @@ func (n *Node) startHTTPGateway(ctx context.Context) error { rqlitePassword = strings.TrimSpace(string(secretBytes)) } + // Read the serverless secrets encryption key (bugboard #837). Must be the + // SAME value on every namespace-gateway node so a secret encrypted by one + // process decrypts on another; an empty value makes get_secret fail loudly + // (the manager refuses an ephemeral key in production). 
+ secretsEncryptionKey := "" + if secretBytes, err := os.ReadFile(filepath.Join(oramaDir, "secrets", "secrets-encryption-key")); err == nil { + secretsEncryptionKey = strings.TrimSpace(string(secretBytes)) + } + gwCfg := &gateway.Config{ ListenAddr: n.config.HTTPGateway.ListenAddr, ClientNamespace: n.config.HTTPGateway.ClientNamespace, @@ -75,6 +84,7 @@ func (n *Node) startHTTPGateway(ctx context.Context) error { RQLitePassword: rqlitePassword, ClusterSecret: clusterSecret, APIKeyHMACSecret: apiKeyHMACSecret, + SecretsEncryptionKey: secretsEncryptionKey, WebRTCEnabled: n.config.HTTPGateway.WebRTC.Enabled, SFUPort: n.config.HTTPGateway.WebRTC.SFUPort, TURNDomain: n.config.HTTPGateway.WebRTC.TURNDomain, diff --git a/core/pkg/push/manager.go b/core/pkg/push/manager.go index 3dd8808..325ed12 100644 --- a/core/pkg/push/manager.go +++ b/core/pkg/push/manager.go @@ -296,7 +296,17 @@ func (m *Manager) buildDispatcher(ctx context.Context, namespace string) (*PushD // (DELETE) — there's no "set this field to empty to clear" // half-state, by design. if nc.NtfyBaseURL != "" { - eff.NtfyBaseURL = nc.NtfyBaseURL + // Defense-in-depth: a base URL stored before the SSRF guard + // existed (or via any path that skipped it) must not point at an + // internal/reserved literal IP. Drop the override and fall back + // to the gateway default if it does. Literal-only (no DNS, no + // syntax re-validation) so this stays safe on the hot build path. 
+ if IsInternalBaseURL(nc.NtfyBaseURL) { + m.logger.Warn("push: ignoring namespace ntfy_base_url override (internal address)", + zap.String("namespace", namespace), zap.String("base_url", nc.NtfyBaseURL)) + } else { + eff.NtfyBaseURL = nc.NtfyBaseURL + } } if nc.NtfyAuthToken != "" { eff.NtfyAuthToken = nc.NtfyAuthToken diff --git a/core/pkg/push/providers/ntfy/credentials.go b/core/pkg/push/providers/ntfy/credentials.go index 64e7e74..f2a39ea 100644 --- a/core/pkg/push/providers/ntfy/credentials.go +++ b/core/pkg/push/providers/ntfy/credentials.go @@ -16,10 +16,11 @@ package ntfy // migration window, with the new credentials store taking precedence. import ( + "context" "encoding/json" "fmt" - "strings" + "github.com/DeBrosOfficial/network/pkg/push" "github.com/DeBrosOfficial/network/pkg/push/credentials" ) @@ -87,7 +88,17 @@ func (Validator) Validate(raw []byte) error { if err := json.Unmarshal(raw, &c); err != nil { return fmt.Errorf("ntfy credentials: invalid JSON: %w", err) } - return validateCredentials(c) + if err := validateCredentials(c); err != nil { + return err + } + // Validate is the config-SET path (the hot build path uses ParseCredentials, + // which skips DNS), so the resolving SSRF check is safe here: reject a + // base_url whose host resolves to an internal/reserved address. Fail-open on + // resolution error — see push.CheckBaseURLResolvable. + if err := push.CheckBaseURLResolvable(context.Background(), c.BaseURL); err != nil { + return fmt.Errorf("ntfy credentials: %w", err) + } + return nil } // Redact returns a JSON-safe view that never echoes the auth token or @@ -127,10 +138,12 @@ func ParseCredentials(raw []byte) (Credentials, error) { // validateCredentials is the shared validator used by both Validate and // ParseCredentials. 
func validateCredentials(c Credentials) error { - if c.BaseURL != "" { - if !strings.HasPrefix(c.BaseURL, "http://") && !strings.HasPrefix(c.BaseURL, "https://") { - return fmt.Errorf("ntfy credentials: base_url must start with http:// or https:// (got %q)", c.BaseURL) - } + // Literal-IP SSRF guard + scheme check. Runs on BOTH the set and the hot + // build path (no DNS), so a stored internal-literal base_url is also + // rejected when the dispatcher is (re)built. The DNS-resolving check lives + // in Validate (set path only). + if err := push.CheckBaseURLSyntax(c.BaseURL); err != nil { + return fmt.Errorf("ntfy credentials: %w", err) } if c.TopicMode != "" { switch c.TopicMode { diff --git a/core/pkg/push/providers/ntfy/credentials_test.go b/core/pkg/push/providers/ntfy/credentials_test.go index 5dfc1c6..431234e 100644 --- a/core/pkg/push/providers/ntfy/credentials_test.go +++ b/core/pkg/push/providers/ntfy/credentials_test.go @@ -26,7 +26,10 @@ func TestValidator_RejectsBadBaseURL(t *testing.T) { } func TestValidator_AcceptsHttpAndHttps(t *testing.T) { - for _, base := range []string{"http://push.local:8080", "https://push.example.com"} { + // Literal public (documentation-range) IPs so the test is deterministic and + // never hits real DNS — Validate now does a set-time SSRF resolve for + // hostname base URLs. + for _, base := range []string{"http://203.0.113.10:8080", "https://203.0.113.10"} { body, _ := json.Marshal(Credentials{BaseURL: base}) if err := NewValidator().Validate(body); err != nil { t.Errorf("base_url=%q rejected: %v", base, err) @@ -34,6 +37,21 @@ func TestValidator_AcceptsHttpAndHttps(t *testing.T) { } } +func TestValidator_RejectsInternalBaseURL(t *testing.T) { + // SSRF guard: a tenant must not point the push base URL at an internal / + // reserved address. Literal IPs are rejected without DNS. 
+ for _, base := range []string{ + "http://169.254.169.254", // cloud metadata + "http://127.0.0.1:8090", // loopback (the operator's local ntfy) + "http://10.0.0.5", // WireGuard mesh + } { + body, _ := json.Marshal(Credentials{BaseURL: base}) + if err := NewValidator().Validate(body); err == nil { + t.Errorf("internal base_url %q must be rejected (SSRF)", base) + } + } +} + func TestValidator_RejectsBadTopicMode(t *testing.T) { if err := NewValidator().Validate([]byte(`{"topic_mode":"random"}`)); err == nil { t.Error("expected rejection of unknown topic_mode") diff --git a/core/pkg/push/providers/ntfy/ntfy.go b/core/pkg/push/providers/ntfy/ntfy.go index adc96b6..4fde81f 100644 --- a/core/pkg/push/providers/ntfy/ntfy.go +++ b/core/pkg/push/providers/ntfy/ntfy.go @@ -74,15 +74,10 @@ func (p *Provider) Send(ctx context.Context, msg push.PushMessage) error { return fmt.Errorf("ntfy: base URL not configured") } - // URL-escape each path segment of the device token. ntfy topics can be - // hierarchical (e.g. "ns/myapp/user-1") and we want to preserve those - // '/' separators while escaping any other special characters that - // could let a malicious token escape the topic path. - parts := strings.Split(msg.DeviceToken, "/") - for i, p := range parts { - parts[i] = url.PathEscape(p) + endpointURL, err := p.resolveEndpoint(msg.DeviceToken) + if err != nil { + return err } - endpointURL := p.baseURL + "/" + strings.Join(parts, "/") req, err := http.NewRequestWithContext(ctx, http.MethodPost, endpointURL, strings.NewReader(msg.Body)) if err != nil { @@ -130,3 +125,58 @@ func (p *Provider) Send(ctx context.Context, msg push.PushMessage) error { _, _ = io.Copy(io.Discard, io.LimitReader(resp.Body, 4096)) return nil } + +// resolveEndpoint maps a device token to the ntfy publish URL. +// +// The token is one of two shapes: +// +// - A plain ntfy topic (possibly hierarchical, e.g. 
"ns/myapp/user-1") — +// published to "/", with each path segment escaped so a +// crafted token can't break out of the topic path. +// - A full UnifiedPush endpoint URL handed to the client by the ntfy +// distributor (e.g. "https://push.example.com/up"). UnifiedPush +// requires the application server to POST to that endpoint verbatim, so we +// use it as-is — but ONLY after verifying its scheme+host match the +// configured base URL. That check turns a device-supplied token into an +// SSRF only against our own push host, never an arbitrary one. +func (p *Provider) resolveEndpoint(token string) (string, error) { + topic := token + if isAbsoluteHTTPURL(token) { + u, err := url.Parse(token) + if err != nil { + return "", fmt.Errorf("ntfy: invalid endpoint url: %w", err) + } + base, err := url.Parse(p.baseURL) + if err != nil { + return "", fmt.Errorf("ntfy: invalid base url %q: %w", p.baseURL, err) + } + if !strings.EqualFold(u.Scheme, base.Scheme) || !strings.EqualFold(u.Host, base.Host) { + // Reject an endpoint pointing anywhere other than the configured + // push host — a device token must never become an SSRF vector. + return "", fmt.Errorf("ntfy: endpoint host %q does not match configured push host %q", u.Host, base.Host) + } + // Confine the URL form to the SAME publish surface as a bare topic: + // take only the path as the topic and re-build through the per-segment + // escaping below, dropping any query/fragment. So a UnifiedPush + // endpoint token can publish a topic but can't gain arbitrary path or + // query control on the push host beyond what a plain topic already has. + topic = strings.TrimPrefix(u.Path, "/") + if topic == "" { + return "", fmt.Errorf("ntfy: endpoint url %q has no topic path", token) + } + } + + // Escape each path segment, preserving the '/' hierarchy. 
+ parts := strings.Split(topic, "/") + for i, seg := range parts { + parts[i] = url.PathEscape(seg) + } + return p.baseURL + "/" + strings.Join(parts, "/"), nil +} + +// isAbsoluteHTTPURL reports whether s looks like an absolute http(s) URL (the +// UnifiedPush endpoint form) rather than a bare ntfy topic. +func isAbsoluteHTTPURL(s string) bool { + lower := strings.ToLower(s) + return strings.HasPrefix(lower, "http://") || strings.HasPrefix(lower, "https://") +} diff --git a/core/pkg/push/providers/ntfy/ntfy_test.go b/core/pkg/push/providers/ntfy/ntfy_test.go index d6f08a3..a1d4586 100644 --- a/core/pkg/push/providers/ntfy/ntfy_test.go +++ b/core/pkg/push/providers/ntfy/ntfy_test.go @@ -7,6 +7,7 @@ import ( "io" "net/http" "net/http/httptest" + "net/url" "strings" "testing" "time" @@ -183,6 +184,108 @@ func TestSend_no_baseURL_returns_error(t *testing.T) { } } +// feat-32: an Android/GrapheneOS UnifiedPush device registers the full endpoint +// URL its distributor hands it. UnifiedPush requires the app server to POST to +// that endpoint verbatim, and we must do so ONLY when the host matches our +// configured push server (never an arbitrary host → no SSRF). + +func TestSend_unifiedPush_endpoint_published(t *testing.T) { + var gotPath, gotBody string + srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + gotPath = r.URL.Path + b, _ := io.ReadAll(r.Body) + gotBody = string(b) + w.WriteHeader(http.StatusOK) + })) + defer srv.Close() + + p := New(Config{BaseURL: srv.URL}, nil) + // The distributor hands the client a full endpoint on the SAME (push) host. 
+ endpoint := srv.URL + "/upAbc123" + if err := p.Send(context.Background(), push.PushMessage{DeviceToken: endpoint, Body: "payload"}); err != nil { + t.Fatalf("Send: %v", err) + } + if gotPath != "/upAbc123" { + t.Errorf("UnifiedPush endpoint must publish to its topic path; got %q", gotPath) + } + if gotBody != "payload" { + t.Errorf("body not delivered; got %q", gotBody) + } +} + +func TestSend_unifiedPush_endpoint_confined_to_topic(t *testing.T) { + // A URL token must be confined to the same publish surface as a bare topic: + // the path becomes the topic, and any query string is dropped — so it can't + // gain arbitrary path/query control on the push host. + var gotPath, gotQuery string + srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + gotPath = r.URL.Path + gotQuery = r.URL.RawQuery + w.WriteHeader(http.StatusOK) + })) + defer srv.Close() + + p := New(Config{BaseURL: srv.URL}, nil) + endpoint := srv.URL + "/uptopic?admin=1&x=y" + if err := p.Send(context.Background(), push.PushMessage{DeviceToken: endpoint, Body: "x"}); err != nil { + t.Fatalf("Send: %v", err) + } + if gotPath != "/uptopic" { + t.Errorf("path must be the topic only; got %q", gotPath) + } + if gotQuery != "" { + t.Errorf("query string must be dropped (no arbitrary query on push host); got %q", gotQuery) + } +} + +func TestSend_unifiedPush_endpoint_rejects_userinfo_bypass(t *testing.T) { + // Classic SSRF guard bypass: smuggle the real host into userinfo. url.Parse + // resolves the authority to the attacker host, so it must be rejected. + hit := false + srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + hit = true + w.WriteHeader(http.StatusOK) + })) + defer srv.Close() + + // base host = srv host; token tries "@attacker.example.com". 
+ base, _ := url.Parse(srv.URL) + p := New(Config{BaseURL: srv.URL}, nil) + token := base.Scheme + "://" + base.Host + "@attacker.example.com/x" + if err := p.Send(context.Background(), push.PushMessage{DeviceToken: token, Body: "x"}); err == nil { + t.Fatal("expected rejection of a userinfo-smuggled host") + } + if hit { + t.Error("no request must be sent for a userinfo-bypass token") + } +} + +func TestSend_unifiedPush_endpoint_rejects_foreign_host(t *testing.T) { + hit := false + srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + hit = true + w.WriteHeader(http.StatusOK) + })) + defer srv.Close() + + p := New(Config{BaseURL: srv.URL}, nil) + // A device token pointing at a DIFFERENT host must be rejected before any + // request is made — a device token must never become an SSRF vector. + err := p.Send(context.Background(), push.PushMessage{ + DeviceToken: "https://attacker.example.com/steal", + Body: "x", + }) + if err == nil { + t.Fatal("expected an error for an endpoint whose host doesn't match the push host") + } + if hit { + t.Error("no request must be sent when the endpoint host doesn't match") + } + if !strings.Contains(err.Error(), "does not match") { + t.Errorf("error should explain the host mismatch; got %v", err) + } +} + func TestName(t *testing.T) { p := New(Config{BaseURL: "http://x"}, nil) if p.Name() != "ntfy" { diff --git a/core/pkg/push/url_guard.go b/core/pkg/push/url_guard.go new file mode 100644 index 0000000..5435720 --- /dev/null +++ b/core/pkg/push/url_guard.go @@ -0,0 +1,193 @@ +package push + +import ( + "bytes" + "context" + "fmt" + "net" + "net/url" + "strings" + "time" +) + +// url_guard.go — SSRF guard for TENANT-supplied push base URLs. +// +// A tenant can override the ntfy base URL the gateway POSTs to (BYO-ntfy is a +// legitimate use case). 
Without a guard, a tenant could point it at an internal +// address — cloud metadata (169.254.169.254), the WireGuard mesh (10.0.0.x), +// loopback — turning the gateway's push sender into an SSRF proxy. These checks +// reject internal/reserved targets while still allowing real external hosts. +// +// IMPORTANT: apply these ONLY to tenant-supplied base URLs (the per-namespace +// override). The operator's gateway default (e.g. 127.0.0.1:8090, the local +// ntfy) is trusted and must NOT pass through here — it would be (correctly) +// rejected as loopback. + +// baseURLDNSTimeout bounds the hostname-resolution step in CheckBaseURLResolvable. +const baseURLDNSTimeout = 5 * time.Second + +// lookupIP resolves a host to its IPs. A package var so tests can substitute a +// deterministic resolver instead of touching real DNS. +var lookupIP = func(ctx context.Context, host string) ([]net.IP, error) { + addrs, err := net.DefaultResolver.LookupIPAddr(ctx, host) + if err != nil { + return nil, err + } + ips := make([]net.IP, len(addrs)) + for i, a := range addrs { + ips[i] = a.IP + } + return ips, nil +} + +// CheckBaseURLSyntax validates a tenant base URL's scheme and rejects a host +// that is a LITERAL internal/reserved IP. It does NOT resolve hostnames, so it +// is safe to call on hot paths (e.g. per-send dispatcher construction). An +// empty base URL is allowed — it means "use the gateway default". 
+func CheckBaseURLSyntax(baseURL string) error { + if baseURL == "" { + return nil + } + u, err := url.Parse(baseURL) + if err != nil { + return fmt.Errorf("base_url: invalid URL: %w", err) + } + if u.Scheme != "http" && u.Scheme != "https" { + return fmt.Errorf("base_url: must start with http:// or https:// (got scheme %q)", u.Scheme) + } + host := u.Hostname() + if host == "" { + return fmt.Errorf("base_url: missing host") + } + if ip := net.ParseIP(host); ip != nil { + if isReservedIP(ip) { + return fmt.Errorf("base_url: host %s is a reserved/internal address and is not allowed", host) + } + return nil + } + // net.ParseIP only accepts canonical dotted-decimal / standard IPv6, but the + // OS resolver + net.Dial ALSO accept decimal ("2130706433"), hex + // ("0x7f000001") and octal ("0177.0.0.1") IPv4 encodings — a literal-check + // bypass to internal addresses. Reject these non-standard numeric hosts + // outright (no legitimate push host is all-numeric or 0x-hex). + if looksLikeNumericHost(host) { + return fmt.Errorf("base_url: host %q is a non-standard numeric/IP encoding and is not allowed", host) + } + return nil +} + +// CheckBaseURLResolvable runs CheckBaseURLSyntax AND, when the host is a name +// rather than a literal IP, resolves it (bounded) and rejects if ANY resolved +// address is internal/reserved — blocking a tenant from pointing a domain at an +// internal host. It performs DNS, so call it ONLY at config-set time (the PUT +// handlers), never on the hot send path. +// +// Resolution failure FAILS OPEN (allowed): an unresolvable host reaches nothing +// (delivery would fail anyway), and rejecting it would break a legitimate host +// that's momentarily unresolvable at config time. The hard floor is +// CheckBaseURLSyntax's literal-IP block, which applies on every code path. +// +// Residual: as a set-time check it does not defend against DNS rebinding (the +// host re-pointing to an internal IP AFTER it was accepted). 
Closing that would +// require a send-time IP check, which is complicated here by the operator's +// loopback default ntfy. +func CheckBaseURLResolvable(ctx context.Context, baseURL string) error { + if err := CheckBaseURLSyntax(baseURL); err != nil { + return err + } + if baseURL == "" { + return nil + } + u, _ := url.Parse(baseURL) // already validated by CheckBaseURLSyntax + host := u.Hostname() + if net.ParseIP(host) != nil { + return nil // literal IP already vetted by CheckBaseURLSyntax + } + + rctx, cancel := context.WithTimeout(ctx, baseURLDNSTimeout) + defer cancel() + ips, err := lookupIP(rctx, host) + if err != nil || len(ips) == 0 { + return nil // fail open on resolution failure (see doc) + } + for _, ip := range ips { + if isReservedIP(ip) { + return fmt.Errorf("base_url: host %q resolves to reserved/internal address %s and is not allowed", host, ip) + } + } + return nil +} + +// IsInternalBaseURL reports whether baseURL parses to a host that is a LITERAL +// internal/reserved IP. Malformed URLs and hostname URLs return false — this is +// the no-false-positive guard for hot paths (e.g. dispatcher build), where the +// goal is only to drop an internal-address override, not to re-validate syntax +// or do DNS (the set-path handlers cover those). +func IsInternalBaseURL(baseURL string) bool { + u, err := url.Parse(baseURL) + if err != nil { + return false + } + host := u.Hostname() + if ip := net.ParseIP(host); ip != nil { + return isReservedIP(ip) + } + // Non-standard numeric encodings (decimal/hex/octal) that net.ParseIP misses + // but net.Dial resolves to an IP — treat as internal so the build-path guard + // matches what the dialer would actually reach. + return looksLikeNumericHost(host) +} + +// isReservedIP reports whether ip is in a range a tenant must never be able to +// reach via a push base URL: loopback, link-local (incl. 169.254.169.254 cloud +// metadata), RFC1918 private, ULA, unspecified, multicast, and 100.64/10 CGNAT. 
// isReservedIP reports whether ip falls in a range a tenant push base URL must
// never reach: loopback, link-local, multicast, RFC1918 private / ULA,
// unspecified, 100.64.0.0/10 CGNAT, and the NAT64 well-known prefix
// 64:ff9b::/96. A nil ip is reported as reserved (unparseable → unsafe).
func isReservedIP(ip net.IP) bool {
	if ip == nil {
		return true // unparseable → treat as unsafe
	}
	if ip4 := ip.To4(); ip4 != nil {
		// 100.64.0.0/10 — carrier-grade NAT (not covered by IsPrivate). The
		// second-octet band [64,127] is the /10.
		if ip4[0] == 100 && ip4[1] >= 64 && ip4[1] <= 127 {
			return true
		}
	} else if ip16 := ip.To16(); ip16 != nil {
		// NAT64 well-known prefix 64:ff9b::/96 (RFC 6052) embeds an IPv4
		// address a NAT64 gateway would translate — so it can reach internal v4.
		if bytes.Equal(ip16[:12], []byte{0x00, 0x64, 0xff, 0x9b, 0, 0, 0, 0, 0, 0, 0, 0}) {
			return true
		}
	}
	return ip.IsLoopback() ||
		ip.IsLinkLocalUnicast() ||
		ip.IsLinkLocalMulticast() ||
		ip.IsInterfaceLocalMulticast() ||
		ip.IsMulticast() ||
		ip.IsPrivate() || // 10/8, 172.16/12, 192.168/16, fc00::/7
		ip.IsUnspecified()
}

// looksLikeNumericHost reports whether host is a non-standard numeric IPv4
// encoding that net.ParseIP rejects but an inet_aton-style parser accepts:
// hex ("0x7f000001", "0x7f.0.0.1"), decimal ("2130706433"), octal
// ("0177.0.0.1"), or any dotted mix of those ("127.0x0.0.1", "127.0.0.0x1").
// Such hosts are never a legitimate push server name, so callers reject them
// rather than let them slip past the literal-IP guard.
//
// The host is numeric-looking when it starts with "0x" (any case), or when
// every dot-separated label is empty, all decimal digits, or "0x" followed by
// hex digits. A host with any other label is treated as an ordinary DNS name.
// The empty host returns false.
func looksLikeNumericHost(host string) bool {
	if host == "" {
		return false
	}
	lower := strings.ToLower(host)
	if strings.HasPrefix(lower, "0x") {
		return true // leading hex literal
	}
	// inet_aton-style parsers read each dotted component independently as
	// decimal, octal or hex, so a hex component may sit in any position.
	// Checking only the host's prefix would miss "127.0x0.0.1".
	for _, label := range strings.Split(lower, ".") {
		if !isNumericHostLabel(label) {
			return false
		}
	}
	return true
}

// isNumericHostLabel reports whether a lower-cased host label consists only of
// decimal digits, or of "0x" followed by hex digits. An empty label counts as
// numeric so malformed all-numeric dotted hosts ("1..2") stay rejected.
func isNumericHostLabel(label string) bool {
	hex := strings.HasPrefix(label, "0x")
	if hex {
		label = label[2:]
	}
	for i := 0; i < len(label); i++ {
		c := label[i]
		switch {
		case c >= '0' && c <= '9':
		case hex && c >= 'a' && c <= 'f':
		default:
			return false
		}
	}
	return true
}
nil { + t.Errorf("CheckBaseURLSyntax(%q) = %v; want nil", tc.url, err) + } + } +} + +func TestIsReservedIP(t *testing.T) { + reserved := []string{ + "127.0.0.1", "169.254.169.254", "10.0.0.1", "172.16.5.5", "192.168.0.1", + "100.64.0.1", "100.100.100.200", "0.0.0.0", "224.0.0.1", "::1", "fe80::1", + "fd00::1", "ff02::1", + "64:ff9b::a00:1", // NAT64-embedded 10.0.0.1 + "64:ff9b::a9fe:a9fe", // NAT64-embedded 169.254.169.254 (metadata) + } + public := []string{"1.1.1.1", "8.8.8.8", "203.0.113.10", "2606:4700:4700::1111"} + for _, s := range reserved { + if ip := net.ParseIP(s); !isReservedIP(ip) { + t.Errorf("isReservedIP(%s) = false; want true (reserved)", s) + } + } + for _, s := range public { + if ip := net.ParseIP(s); isReservedIP(ip) { + t.Errorf("isReservedIP(%s) = true; want false (public)", s) + } + } + if !isReservedIP(nil) { + t.Error("isReservedIP(nil) must be true (unparseable → unsafe)") + } +} + +func TestIsInternalBaseURL(t *testing.T) { + internal := []string{ + "http://10.0.0.5", "http://169.254.169.254", + "https://127.0.0.1:8090", "http://[::1]", "http://192.168.1.1", + "http://[64:ff9b::a00:5]", // NAT64 + "http://0x7f000001", // hex-encoded loopback + "http://2130706433", // decimal-encoded loopback + "http://0177.0.0.1", // octal-encoded loopback + } + notInternal := []string{ + "https://push.example.com", // hostname → false (the set path resolves it) + "https://1.1.1.1", // public literal IP + "ns-A-url", // malformed placeholder → must NOT be dropped + "v1", "", "not a url", + } + for _, s := range internal { + if !IsInternalBaseURL(s) { + t.Errorf("IsInternalBaseURL(%q) = false; want true (internal literal IP)", s) + } + } + for _, s := range notInternal { + if IsInternalBaseURL(s) { + t.Errorf("IsInternalBaseURL(%q) = true; want false", s) + } + } +} + +func TestCheckBaseURLResolvable(t *testing.T) { + orig := lookupIP + defer func() { lookupIP = orig }() + + t.Run("hostname resolving to internal is rejected", func(t *testing.T) { + 
lookupIP = func(_ context.Context, host string) ([]net.IP, error) { + return []net.IP{net.ParseIP("10.0.0.7")}, nil // points at the mesh + } + if err := CheckBaseURLResolvable(context.Background(), "https://evil.example.com"); err == nil { + t.Fatal("expected rejection of a host resolving to an internal address") + } + }) + + t.Run("hostname resolving to public is allowed", func(t *testing.T) { + lookupIP = func(_ context.Context, host string) ([]net.IP, error) { + return []net.IP{net.ParseIP("203.0.113.50")}, nil + } + if err := CheckBaseURLResolvable(context.Background(), "https://push.example.com"); err != nil { + t.Fatalf("public-resolving host should pass: %v", err) + } + }) + + t.Run("any internal IP among results is rejected", func(t *testing.T) { + lookupIP = func(_ context.Context, host string) ([]net.IP, error) { + return []net.IP{net.ParseIP("203.0.113.50"), net.ParseIP("127.0.0.1")}, nil + } + if err := CheckBaseURLResolvable(context.Background(), "https://mixed.example.com"); err == nil { + t.Fatal("a host resolving to ANY internal address must be rejected") + } + }) + + t.Run("resolution failure is allowed (fail open)", func(t *testing.T) { + lookupIP = func(_ context.Context, host string) ([]net.IP, error) { + return nil, errors.New("nxdomain") + } + if err := CheckBaseURLResolvable(context.Background(), "https://unresolvable.example.com"); err != nil { + t.Fatalf("an unresolvable host should fail open (be allowed); got %v", err) + } + }) + + t.Run("literal internal IP rejected without DNS", func(t *testing.T) { + lookupIP = func(_ context.Context, host string) ([]net.IP, error) { + t.Fatal("DNS must not be consulted for a literal IP host") + return nil, nil + } + if err := CheckBaseURLResolvable(context.Background(), "http://169.254.169.254"); err == nil { + t.Fatal("literal metadata IP must be rejected") + } + }) + + t.Run("empty is allowed", func(t *testing.T) { + if err := CheckBaseURLResolvable(context.Background(), ""); err != nil { + 
t.Fatalf("empty base_url should pass: %v", err) + } + }) +} diff --git a/core/pkg/serverless/engine.go b/core/pkg/serverless/engine.go index 58c4529..9f62995 100644 --- a/core/pkg/serverless/engine.go +++ b/core/pkg/serverless/engine.go @@ -2,6 +2,7 @@ package serverless import ( "context" + cryptorand "crypto/rand" "errors" "fmt" "time" @@ -318,6 +319,15 @@ func (e *Engine) Execute(ctx context.Context, fn *Function, input []byte, invCtx // gates invocation frequency, not per-invocation host-call volume). execCtx = WithPublishCounter(execCtx) + // Raw-HTTP-response mode (bugboard #835). Only RawHTTPResponse functions + // get a collector attached — set_http_response is a validated no-op for + // every other function (no collector → host call returns an error). The + // collector rides execCtx so concurrent invocations never cross-write, + // matching the publish-counter / log-buffer per-call model. + if fn.RawHTTPResponse { + execCtx = WithRawHTTPCollector(execCtx) + } + // Get compiled module (from cache or compile) module, err := e.getOrCompileModule(execCtx, fn.WASMCID) if err != nil { @@ -346,6 +356,14 @@ func (e *Engine) Execute(ctx context.Context, fn *Function, input []byte, invCtx return nil, &ExecutionError{FunctionName: fn.Name, RequestID: invCtx.RequestID, Cause: err} } + // Surface any verbatim HTTP response the function set (bugboard #835) + // onto invCtx so the Invoker → HTTP handler can replay it. Only + // RawHTTPResponse functions have a collector attached; TakeRawHTTPResponse + // returns (_, false) otherwise. 
+ if res, ok := TakeRawHTTPResponse(execCtx); ok { + invCtx.RawHTTP = &res + } + e.logInvocation(ctx, fn, invCtx, logBuf, startTime, len(output), InvocationStatusSuccess, nil) e.logSlowInvocation(invCtx, startTime, ratelimitDoneAt, moduleLoadedAt, executeDoneAt, "success", nil) return output, nil @@ -547,7 +565,13 @@ func (e *Engine) InstantiatePersistent(ctx context.Context, fn *Function, invCtx // into real clocks via the documented wazero hook — same effect as // the runtime would get on a normal Go process. WithSysWalltime(). - WithSysNanotime() + WithSysNanotime(). + // Bugboard #120 — same class as #27. Without WithRandSource, wazero's + // default RNG is deterministic (zero seed), so TinyGo crypto/rand.Read + // returns identical bytes on every fresh instance — constant codes / + // nonces / tokens. Wire in the host CSPRNG. Same fix at + // execution/executor.go for the stateless path. + WithRandSource(cryptorand.Reader) instance, err := e.runtime.InstantiateModule(ctx, compiled, moduleConfig) if err != nil { @@ -742,6 +766,7 @@ func (e *Engine) registerHostModule(ctx context.Context) error { NewFunctionBuilder().WithFunc(e.hCacheIncrBy).Export("cache_incr_by"). NewFunctionBuilder().WithFunc(e.hHTTPFetch).Export("http_fetch"). NewFunctionBuilder().WithFunc(e.hAnyoneFetch).Export("anyone_fetch"). + NewFunctionBuilder().WithFunc(e.hSetHTTPResponse).Export("set_http_response"). NewFunctionBuilder().WithFunc(e.hPubSubPublish).Export("pubsub_publish"). NewFunctionBuilder().WithFunc(e.hPubSubPublishBatch).Export("pubsub_publish_batch"). NewFunctionBuilder().WithFunc(e.hPushSend).Export("push_send"). @@ -751,6 +776,8 @@ func (e *Engine) registerHostModule(ctx context.Context) error { NewFunctionBuilder().WithFunc(e.hWSPubSubUnbridge).Export("ws_pubsub_unbridge"). NewFunctionBuilder().WithFunc(e.hWSSend).Export("ws_send"). NewFunctionBuilder().WithFunc(e.hWSBroadcast).Export("ws_broadcast"). 
+ NewFunctionBuilder().WithFunc(e.hEphemeralStateSet).Export("ephemeral_state_set"). + NewFunctionBuilder().WithFunc(e.hEphemeralStateClear).Export("ephemeral_state_clear"). NewFunctionBuilder().WithFunc(e.hFunctionInvoke).Export("function_invoke"). NewFunctionBuilder().WithFunc(e.hFunctionInvokeAsync).Export("function_invoke_async"). NewFunctionBuilder().WithFunc(e.hLogInfo).Export("log_info"). @@ -948,6 +975,40 @@ func (e *Engine) hHTTPFetch(ctx context.Context, mod api.Module, methodPtr, meth return e.executor.WriteToGuest(ctx, mod, resp) } +// hSetHTTPResponse is the WASM-callable wrapper for SetHTTPResponse — +// bugboard #835 raw-HTTP-response mode. +// +// ABI: set_http_response(status i32, headersJSONPtr, headersJSONLen, +// bodyPtr, bodyLen uint32) -> uint32. headersJSON (when non-empty) is a JSON +// object of string→string. Returns 1 on success, 0 on failure (function not +// deployed with raw_http_response, bad status, oversized headers/body, or a +// guest-memory read error). +func (e *Engine) hSetHTTPResponse(ctx context.Context, mod api.Module, + status, headersPtr, headersLen, bodyPtr, bodyLen uint32) uint32 { + var headers map[string]string + if headersLen > 0 { + if err := e.executor.UnmarshalJSONFromGuest(mod, headersPtr, headersLen, &headers); err != nil { + e.logger.Warn("set_http_response: failed to unmarshal headers", zap.Error(err)) + return 0 + } + } + + var body []byte + if bodyLen > 0 { + b, ok := e.executor.ReadFromGuest(mod, bodyPtr, bodyLen) + if !ok { + return 0 + } + body = b + } + + if err := e.hostServices.SetHTTPResponse(ctx, int(status), headers, body); err != nil { + e.logger.Warn("host function set_http_response failed", zap.Error(err)) + return 0 + } + return 1 +} + // hAnyoneFetch is the WASM-callable wrapper for AnyoneFetch — feat-11. // Identical ABI to hHTTPFetch (method, url, headers JSON, body), routes // through the Anyone SOCKS5 proxy. 
Returns packed (ptr<<32 | len) to the @@ -1291,6 +1352,67 @@ func (e *Engine) hWSBroadcast(ctx context.Context, mod api.Module, return 1 } +// hEphemeralStateSet is the WASM-callable wrapper for EphemeralStateSet — +// bugboard #710 WS-subscribe-tracked ephemeral state. +// +// ABI: ephemeral_state_set(topicPtr, topicLen, keyPtr, keyLen, payloadPtr, +// payloadLen uint32, ttlMs int64) -> uint32. Returns 1 on success, 0 on +// failure (no WS client in context, empty topic/key, oversized payload, +// per-client key cap, or a guest-memory read error). +func (e *Engine) hEphemeralStateSet(ctx context.Context, mod api.Module, + topicPtr, topicLen, keyPtr, keyLen, payloadPtr, payloadLen uint32, ttlMs int64) uint32 { + topic, ok := e.executor.ReadFromGuest(mod, topicPtr, topicLen) + if !ok { + return 0 + } + key, ok := e.executor.ReadFromGuest(mod, keyPtr, keyLen) + if !ok { + return 0 + } + var payload []byte + if payloadLen > 0 { + p, ok := e.executor.ReadFromGuest(mod, payloadPtr, payloadLen) + if !ok { + return 0 + } + payload = p + } + if err := e.hostServices.EphemeralStateSet(ctx, string(topic), string(key), payload, ttlMs); err != nil { + e.logger.Warn("host function ephemeral_state_set failed", + zap.String("topic", string(topic)), + zap.String("key", string(key)), + zap.Error(err)) + return 0 + } + return 1 +} + +// hEphemeralStateClear is the WASM-callable wrapper for EphemeralStateClear. +// +// ABI: ephemeral_state_clear(topicPtr, topicLen, keyPtr, keyLen uint32) -> +// uint32. Returns 1 on success (including idempotent clears of a missing key), +// 0 on failure (no WS client in context, empty topic/key, or a guest-memory +// read error). 
+func (e *Engine) hEphemeralStateClear(ctx context.Context, mod api.Module, + topicPtr, topicLen, keyPtr, keyLen uint32) uint32 { + topic, ok := e.executor.ReadFromGuest(mod, topicPtr, topicLen) + if !ok { + return 0 + } + key, ok := e.executor.ReadFromGuest(mod, keyPtr, keyLen) + if !ok { + return 0 + } + if err := e.hostServices.EphemeralStateClear(ctx, string(topic), string(key)); err != nil { + e.logger.Warn("host function ephemeral_state_clear failed", + zap.String("topic", string(topic)), + zap.String("key", string(key)), + zap.Error(err)) + return 0 + } + return 1 +} + // hPushSend is the WASM-callable wrapper for PushSend. // Inputs: // userIDPtr/userIDLen — UTF-8 user ID to push to (within the function's diff --git a/core/pkg/serverless/ephemeral_disconnect_test.go b/core/pkg/serverless/ephemeral_disconnect_test.go new file mode 100644 index 0000000..31e898f --- /dev/null +++ b/core/pkg/serverless/ephemeral_disconnect_test.go @@ -0,0 +1,52 @@ +package serverless + +import ( + "context" + "testing" + + "go.uber.org/zap" +) + +// fakeWSConn is a no-op WebSocketConn for exercising WSManager lifecycle. +type fakeWSConn struct{} + +func (fakeWSConn) WriteMessage(int, []byte) error { return nil } +func (fakeWSConn) ReadMessage() (int, []byte, error) { return 0, nil, nil } +func (fakeWSConn) Close() error { return nil } + +// TestWSManager_DisconnectHookClearsEphemeralState verifies the wiring that +// makes Feature #710's auto-clear work: a disconnect hook registered on the +// WSManager fires on Unregister, clearing the disconnecting client's ephemeral +// state. Both the stateless and persistent WS handlers call Unregister, so +// this single hook covers both paths. +func TestWSManager_DisconnectHookClearsEphemeralState(t *testing.T) { + logger := zap.NewNop() + wsm := NewWSManager(logger) + pub := &capturePublisher{} + store := NewEphemeralStore(pub.publish) + + // Wire the hook exactly as NewHostFunctions does. 
+ wsm.AddDisconnectHook(func(clientID string) { + store.ClearClient(context.Background(), clientID) + }) + + clientID := "client-A" + wsm.Register(clientID, fakeWSConn{}) + + if err := store.Set(context.Background(), "ns1", clientID, "t", "k", []byte("p"), 0); err != nil { + t.Fatalf("Set: %v", err) + } + if store.keyCountForTest() != 1 { + t.Fatalf("expected 1 key before disconnect, got %d", store.keyCountForTest()) + } + + // Disconnect → hook fires → state cleared + synthetic clear published. + wsm.Unregister(clientID) + + if store.keyCountForTest() != 0 { + t.Errorf("disconnect hook did not clear ephemeral state, count=%d", store.keyCountForTest()) + } + if pub.countKind(EphemeralEventClear) != 1 { + t.Errorf("expected 1 synthetic clear on disconnect, got %d", pub.countKind(EphemeralEventClear)) + } +} diff --git a/core/pkg/serverless/ephemeral_state.go b/core/pkg/serverless/ephemeral_state.go new file mode 100644 index 0000000..590e66c --- /dev/null +++ b/core/pkg/serverless/ephemeral_state.go @@ -0,0 +1,352 @@ +package serverless + +import ( + "context" + "encoding/json" + "fmt" + "sync" + "time" +) + +// WS-subscribe-tracked ephemeral state primitive (bugboard #710). +// +// A serverless function can publish short-lived per-subscriber state (typing +// indicators, "online" flags, cursor positions, …) keyed by (topic, key) and +// have the gateway AUTO-CLEAR that state the moment the owning WebSocket +// client disconnects — publishing a synthetic clear event so every subscriber +// sees the state vanish with zero cron lag. State also expires on a TTL as a +// backstop. +// +// Ownership model: each set is tagged with the CURRENT invocation's WS client +// ID (the same source GetWSClientID reads). On disconnect the store iterates +// that client's owned (topic,key) entries, publishes a clear event for each, +// and drops them. A client's disconnect never touches another client's state. 
+ +const ( + // ephemeralMaxKeysPerClient caps how many distinct (topic,key) entries a + // single WS client may own at once. Bounds the per-client memory + the + // fan-out of synthetic clears on disconnect. + ephemeralMaxKeysPerClient = 256 + + // ephemeralMaxPayloadBytes caps a single ephemeral payload. Generous for + // presence/typing/cursor metadata while bounding gateway memory. + ephemeralMaxPayloadBytes = 16 << 10 // 16 KiB + + // ephemeralMaxTTL caps the requested TTL. Ephemeral state is meant to be + // short-lived; the disconnect hook is the primary cleanup path and the TTL + // is only a backstop, so a long TTL is never useful. + ephemeralMaxTTL = 30 * time.Minute + + // ephemeralDefaultTTL is applied when a caller passes ttlMs <= 0. + ephemeralDefaultTTL = 60 * time.Second + + // ephemeralSweepInterval is how often the backstop sweeper scans for + // expired entries. The disconnect hook handles the common case; the + // sweeper only catches entries whose owner is still connected but whose + // TTL elapsed. + ephemeralSweepInterval = 10 * time.Second +) + +// EphemeralEventKind discriminates the synthetic events published on a topic. +type EphemeralEventKind string + +const ( + EphemeralEventSet EphemeralEventKind = "set" + EphemeralEventClear EphemeralEventKind = "clear" +) + +// EphemeralEvent is the wire shape published on the topic when ephemeral state +// is set, cleared, or auto-cleared on disconnect/expiry. Subscribers key off +// Kind + Key to update their local view. Payload is only populated for "set". +type EphemeralEvent struct { + Type string `json:"__ephemeral"` // always "state" + Kind EphemeralEventKind `json:"kind"` // set | clear + Key string `json:"key"` // app-chosen key + ClientID string `json:"client_id"` // owning WS client + // Payload is the opaque app-chosen blob (may be JSON, protobuf, or + // arbitrary bytes), present only for "set". 
encoding/json base64-encodes + // a []byte on the wire, so subscribers base64-decode "payload" to recover + // the original bytes — mirroring how pubsub_publish_batch carries data. + Payload []byte `json:"payload,omitempty"` + Reason string `json:"reason,omitempty"` // clear only: explicit|disconnect|expired +} + +// ephemeralPublisher publishes data on a (namespace, topic). Abstracted so the +// store can publish synthetic clears without depending on the concrete pubsub +// adapter type — and so tests can capture published events. Namespace handling +// matches the host pubsub path: the adapter namespaces internally, so this +// publisher receives the already-namespaced caller's topic verbatim. +type ephemeralPublisher func(ctx context.Context, namespace, topic string, data []byte) error + +// ephemeralEntry is one stored value plus its expiry and the metadata needed +// to publish a clear event for it. +type ephemeralEntry struct { + namespace string + topic string + key string + clientID string + payload []byte + expiresAt time.Time +} + +// ephemeralStateKey identifies a stored value across namespaces/topics. +type ephemeralStateKey struct { + namespace string + topic string + key string +} + +// EphemeralStore holds WS-subscribe-tracked ephemeral state with auto-clear on +// disconnect (bugboard #710). Safe for concurrent use. +type EphemeralStore struct { + publish ephemeralPublisher + + mu sync.Mutex + // values keyed by (ns, topic, key). + values map[ephemeralStateKey]*ephemeralEntry + // owned maps a clientID to the set of state keys it owns, for O(1) + // disconnect cleanup. + owned map[string]map[ephemeralStateKey]struct{} + + // sweeper lifecycle. + stopOnce sync.Once + stopCh chan struct{} + now func() time.Time // injectable clock for tests +} + +// NewEphemeralStore constructs a store with the given publisher. 
The publisher +// may be nil (set/clear then skip publishing) — useful in tests, but in +// production the host wires the pubsub adapter so subscribers see events. +func NewEphemeralStore(publish ephemeralPublisher) *EphemeralStore { + return &EphemeralStore{ + publish: publish, + values: make(map[ephemeralStateKey]*ephemeralEntry), + owned: make(map[string]map[ephemeralStateKey]struct{}), + stopCh: make(chan struct{}), + now: time.Now, + } +} + +// Set records an ephemeral value owned by clientID and publishes a "set" event +// on the topic so subscribers observe it. Returns an error on validation +// failure (empty client/topic/key, oversized payload, per-client cap reached). +func (s *EphemeralStore) Set(ctx context.Context, namespace, clientID, topic, key string, payload []byte, ttlMs int64) error { + if clientID == "" { + return fmt.Errorf("ephemeral_state_set: requires a WebSocket client (no ws_client_id in invocation context)") + } + if topic == "" || key == "" { + return fmt.Errorf("ephemeral_state_set: topic and key are required") + } + if len(payload) > ephemeralMaxPayloadBytes { + return fmt.Errorf("ephemeral_state_set: payload too large (%d > %d bytes)", len(payload), ephemeralMaxPayloadBytes) + } + + ttl := time.Duration(ttlMs) * time.Millisecond + if ttl <= 0 { + ttl = ephemeralDefaultTTL + } + if ttl > ephemeralMaxTTL { + ttl = ephemeralMaxTTL + } + + sk := ephemeralStateKey{namespace: namespace, topic: topic, key: key} + payloadCopy := make([]byte, len(payload)) + copy(payloadCopy, payload) + + s.mu.Lock() + ownedSet := s.owned[clientID] + // Enforce the per-client cap only for NEW keys this client doesn't already + // own — overwriting an existing key must always be allowed. 
+ if _, alreadyOwned := s.values[sk]; !alreadyOwned || s.values[sk].clientID != clientID { + if len(ownedSet) >= ephemeralMaxKeysPerClient { + s.mu.Unlock() + return fmt.Errorf("ephemeral_state_set: client %s exceeded max %d ephemeral keys", clientID, ephemeralMaxKeysPerClient) + } + } + + // If a different client owned this exact (ns,topic,key), transfer ownership + // — drop it from the previous owner's set so its disconnect won't clear + // state it no longer owns. + if prev, ok := s.values[sk]; ok && prev.clientID != clientID { + if prevSet := s.owned[prev.clientID]; prevSet != nil { + delete(prevSet, sk) + if len(prevSet) == 0 { + delete(s.owned, prev.clientID) + } + } + } + + s.values[sk] = &ephemeralEntry{ + namespace: namespace, + topic: topic, + key: key, + clientID: clientID, + payload: payloadCopy, + expiresAt: s.now().Add(ttl), + } + if ownedSet == nil { + ownedSet = make(map[ephemeralStateKey]struct{}) + s.owned[clientID] = ownedSet + } + ownedSet[sk] = struct{}{} + s.mu.Unlock() + + evt := EphemeralEvent{ + Type: "state", + Kind: EphemeralEventSet, + Key: key, + ClientID: clientID, + Payload: payloadCopy, + } + return s.publishEvent(ctx, namespace, topic, evt) +} + +// Clear removes an ephemeral value the client owns and publishes a "clear" +// event with reason "explicit". Clearing a key the client does not own (or a +// missing key) is a no-op that still returns nil — clears are idempotent. 
+func (s *EphemeralStore) Clear(ctx context.Context, namespace, clientID, topic, key string) error { + if clientID == "" { + return fmt.Errorf("ephemeral_state_clear: requires a WebSocket client (no ws_client_id in invocation context)") + } + if topic == "" || key == "" { + return fmt.Errorf("ephemeral_state_clear: topic and key are required") + } + + sk := ephemeralStateKey{namespace: namespace, topic: topic, key: key} + + s.mu.Lock() + entry, ok := s.values[sk] + if !ok || entry.clientID != clientID { + // Not present, or owned by someone else — idempotent no-op. + s.mu.Unlock() + return nil + } + s.removeLocked(sk, entry) + s.mu.Unlock() + + return s.publishEvent(ctx, namespace, topic, EphemeralEvent{ + Type: "state", + Kind: EphemeralEventClear, + Key: key, + ClientID: clientID, + Reason: "explicit", + }) +} + +// ClearClient removes every entry owned by clientID and publishes a clear +// event for each (reason "disconnect"). Called from the WS disconnect hook — +// the primary, zero-lag cleanup path. Safe to call for an unknown client. +func (s *EphemeralStore) ClearClient(ctx context.Context, clientID string) { + s.clearClientWithReason(ctx, clientID, "disconnect") +} + +func (s *EphemeralStore) clearClientWithReason(ctx context.Context, clientID, reason string) { + s.mu.Lock() + ownedSet := s.owned[clientID] + if len(ownedSet) == 0 { + delete(s.owned, clientID) + s.mu.Unlock() + return + } + // Snapshot entries to publish after releasing the lock. + toClear := make([]*ephemeralEntry, 0, len(ownedSet)) + for sk := range ownedSet { + if entry, ok := s.values[sk]; ok { + toClear = append(toClear, entry) + delete(s.values, sk) + } + } + delete(s.owned, clientID) + s.mu.Unlock() + + for _, entry := range toClear { + _ = s.publishEvent(ctx, entry.namespace, entry.topic, EphemeralEvent{ + Type: "state", + Kind: EphemeralEventClear, + Key: entry.key, + ClientID: clientID, + Reason: reason, + }) + } +} + +// removeLocked drops one entry from both maps. 
Caller holds s.mu. +func (s *EphemeralStore) removeLocked(sk ephemeralStateKey, entry *ephemeralEntry) { + delete(s.values, sk) + if set := s.owned[entry.clientID]; set != nil { + delete(set, sk) + if len(set) == 0 { + delete(s.owned, entry.clientID) + } + } +} + +// publishEvent marshals and publishes a synthetic event. No-op (nil) when no +// publisher is wired. +func (s *EphemeralStore) publishEvent(ctx context.Context, namespace, topic string, evt EphemeralEvent) error { + if s.publish == nil { + return nil + } + data, err := json.Marshal(evt) + if err != nil { + return fmt.Errorf("ephemeral state: marshal event: %w", err) + } + if err := s.publish(ctx, namespace, topic, data); err != nil { + return fmt.Errorf("ephemeral state: publish %s event: %w", evt.Kind, err) + } + return nil +} + +// StartSweeper launches the TTL backstop sweeper. Idempotent guards aren't +// provided — call exactly once. Stop with StopSweeper. +func (s *EphemeralStore) StartSweeper() { + go func() { + ticker := time.NewTicker(ephemeralSweepInterval) + defer ticker.Stop() + for { + select { + case <-s.stopCh: + return + case <-ticker.C: + s.sweepExpired(context.Background()) + } + } + }() +} + +// StopSweeper stops the backstop sweeper. Safe to call multiple times. +func (s *EphemeralStore) StopSweeper() { + s.stopOnce.Do(func() { close(s.stopCh) }) +} + +// sweepExpired removes and publishes clears for every entry whose TTL elapsed. 
+func (s *EphemeralStore) sweepExpired(ctx context.Context) { + now := s.now() + + s.mu.Lock() + var expired []*ephemeralEntry + for sk, entry := range s.values { + if now.After(entry.expiresAt) { + expired = append(expired, entry) + s.removeLocked(sk, entry) + } + } + s.mu.Unlock() + + for _, entry := range expired { + _ = s.publishEvent(ctx, entry.namespace, entry.topic, EphemeralEvent{ + Type: "state", + Kind: EphemeralEventClear, + Key: entry.key, + ClientID: entry.clientID, + Reason: "expired", + }) + } +} + +// keyCountForTest returns the number of stored values (test-only accessor). +func (s *EphemeralStore) keyCountForTest() int { + s.mu.Lock() + defer s.mu.Unlock() + return len(s.values) +} diff --git a/core/pkg/serverless/ephemeral_state_test.go b/core/pkg/serverless/ephemeral_state_test.go new file mode 100644 index 0000000..ba6bb98 --- /dev/null +++ b/core/pkg/serverless/ephemeral_state_test.go @@ -0,0 +1,295 @@ +package serverless + +import ( + "context" + "encoding/json" + "fmt" + "sync" + "testing" + "time" +) + +// capturePublisher records every published event for assertions. 
+type capturePublisher struct { + mu sync.Mutex + events []capturedEvent +} + +type capturedEvent struct { + namespace string + topic string + event EphemeralEvent +} + +func (c *capturePublisher) publish(_ context.Context, namespace, topic string, data []byte) error { + var evt EphemeralEvent + if err := json.Unmarshal(data, &evt); err != nil { + return err + } + c.mu.Lock() + c.events = append(c.events, capturedEvent{namespace: namespace, topic: topic, event: evt}) + c.mu.Unlock() + return nil +} + +func (c *capturePublisher) snapshot() []capturedEvent { + c.mu.Lock() + defer c.mu.Unlock() + out := make([]capturedEvent, len(c.events)) + copy(out, c.events) + return out +} + +func (c *capturePublisher) countKind(kind EphemeralEventKind) int { + c.mu.Lock() + defer c.mu.Unlock() + n := 0 + for _, e := range c.events { + if e.event.Kind == kind { + n++ + } + } + return n +} + +func newTestStore(pub ephemeralPublisher) *EphemeralStore { + s := NewEphemeralStore(pub) + return s +} + +func TestEphemeralStore_SetThenClear(t *testing.T) { + pub := &capturePublisher{} + s := newTestStore(pub.publish) + ctx := context.Background() + + if err := s.Set(ctx, "ns1", "client-A", "typing:room1", "k1", []byte(`{"typing":true}`), 0); err != nil { + t.Fatalf("Set: %v", err) + } + if s.keyCountForTest() != 1 { + t.Fatalf("expected 1 stored key, got %d", s.keyCountForTest()) + } + + if err := s.Clear(ctx, "ns1", "client-A", "typing:room1", "k1"); err != nil { + t.Fatalf("Clear: %v", err) + } + if s.keyCountForTest() != 0 { + t.Errorf("expected 0 stored keys after clear, got %d", s.keyCountForTest()) + } + + if got := pub.countKind(EphemeralEventSet); got != 1 { + t.Errorf("set events = %d, want 1", got) + } + if got := pub.countKind(EphemeralEventClear); got != 1 { + t.Errorf("clear events = %d, want 1", got) + } + // The set event must carry the payload verbatim. 
+ evts := pub.snapshot() + if string(evts[0].event.Payload) != `{"typing":true}` { + t.Errorf("set payload = %q, want the original JSON", evts[0].event.Payload) + } + if evts[1].event.Reason != "explicit" { + t.Errorf("clear reason = %q, want explicit", evts[1].event.Reason) + } +} + +func TestEphemeralStore_SetThenDisconnect(t *testing.T) { + pub := &capturePublisher{} + s := newTestStore(pub.publish) + ctx := context.Background() + + if err := s.Set(ctx, "ns1", "client-A", "topicX", "kA", []byte("p1"), 0); err != nil { + t.Fatalf("Set kA: %v", err) + } + if err := s.Set(ctx, "ns1", "client-A", "topicY", "kB", []byte("p2"), 0); err != nil { + t.Fatalf("Set kB: %v", err) + } + + s.ClearClient(ctx, "client-A") + + if s.keyCountForTest() != 0 { + t.Errorf("expected all state dropped on disconnect, got %d", s.keyCountForTest()) + } + // One synthetic clear per owned key, all reason=disconnect. + if got := pub.countKind(EphemeralEventClear); got != 2 { + t.Errorf("disconnect clear events = %d, want 2", got) + } + for _, e := range pub.snapshot() { + if e.event.Kind == EphemeralEventClear && e.event.Reason != "disconnect" { + t.Errorf("clear reason = %q, want disconnect", e.event.Reason) + } + } +} + +func TestEphemeralStore_TTLExpiry(t *testing.T) { + pub := &capturePublisher{} + s := newTestStore(pub.publish) + ctx := context.Background() + + // Freeze the clock so we control expiry deterministically. + base := time.Now() + s.now = func() time.Time { return base } + + if err := s.Set(ctx, "ns1", "client-A", "topicX", "kA", []byte("p"), 1000); err != nil { + t.Fatalf("Set: %v", err) + } + + // Before expiry: sweep is a no-op. + s.sweepExpired(ctx) + if s.keyCountForTest() != 1 { + t.Fatalf("entry expired too early, count=%d", s.keyCountForTest()) + } + + // Advance past the 1s TTL and sweep. 
+ s.now = func() time.Time { return base.Add(2 * time.Second) } + s.sweepExpired(ctx) + if s.keyCountForTest() != 0 { + t.Errorf("entry not swept after TTL, count=%d", s.keyCountForTest()) + } + + // A clear event with reason=expired must have been published. + foundExpired := false + for _, e := range pub.snapshot() { + if e.event.Kind == EphemeralEventClear && e.event.Reason == "expired" { + foundExpired = true + } + } + if !foundExpired { + t.Error("expected a clear event with reason=expired") + } +} + +func TestEphemeralStore_TTLClampedToMax(t *testing.T) { + pub := &capturePublisher{} + s := newTestStore(pub.publish) + base := time.Now() + s.now = func() time.Time { return base } + + // Request a TTL far beyond the max; it must be clamped. + huge := (ephemeralMaxTTL + time.Hour).Milliseconds() + if err := s.Set(context.Background(), "ns1", "c", "t", "k", []byte("p"), huge); err != nil { + t.Fatalf("Set: %v", err) + } + s.mu.Lock() + entry := s.values[ephemeralStateKey{namespace: "ns1", topic: "t", key: "k"}] + s.mu.Unlock() + if entry == nil { + t.Fatal("entry missing") + } + maxExpiry := base.Add(ephemeralMaxTTL) + if entry.expiresAt.After(maxExpiry) { + t.Errorf("TTL not clamped: expiresAt %v after max %v", entry.expiresAt, maxExpiry) + } +} + +func TestEphemeralStore_PerClientCapEnforced(t *testing.T) { + pub := &capturePublisher{} + s := newTestStore(pub.publish) + ctx := context.Background() + + for i := 0; i < ephemeralMaxKeysPerClient; i++ { + if err := s.Set(ctx, "ns1", "client-A", "t", fmt.Sprintf("k%d", i), []byte("p"), 0); err != nil { + t.Fatalf("Set #%d: %v", i, err) + } + } + // The next NEW key must be rejected. 
+ err := s.Set(ctx, "ns1", "client-A", "t", "overflow", []byte("p"), 0) + if err == nil { + t.Fatal("expected per-client cap error") + } + if s.keyCountForTest() != ephemeralMaxKeysPerClient { + t.Errorf("stored keys = %d, want %d (overflow must not be stored)", s.keyCountForTest(), ephemeralMaxKeysPerClient) + } + + // Overwriting an EXISTING key must still succeed even at the cap. + if err := s.Set(ctx, "ns1", "client-A", "t", "k0", []byte("updated"), 0); err != nil { + t.Errorf("overwrite at cap rejected: %v", err) + } +} + +func TestEphemeralStore_ClientIsolation(t *testing.T) { + pub := &capturePublisher{} + s := newTestStore(pub.publish) + ctx := context.Background() + + if err := s.Set(ctx, "ns1", "client-A", "t", "kA", []byte("a"), 0); err != nil { + t.Fatalf("Set A: %v", err) + } + if err := s.Set(ctx, "ns1", "client-B", "t", "kB", []byte("b"), 0); err != nil { + t.Fatalf("Set B: %v", err) + } + + // Disconnecting A must NOT touch B's state. + s.ClearClient(ctx, "client-A") + if s.keyCountForTest() != 1 { + t.Fatalf("expected B's single key to survive A's disconnect, got %d", s.keyCountForTest()) + } + s.mu.Lock() + _, bSurvives := s.values[ephemeralStateKey{namespace: "ns1", topic: "t", key: "kB"}] + s.mu.Unlock() + if !bSurvives { + t.Error("client-B's state was wrongly cleared by client-A's disconnect") + } + + // A also cannot clear B's key (not the owner): idempotent no-op. 
+ if err := s.Clear(ctx, "ns1", "client-A", "t", "kB"); err != nil { + t.Fatalf("cross-client Clear should be a no-op, got err: %v", err) + } + if s.keyCountForTest() != 1 { + t.Error("client-A managed to clear client-B's key") + } +} + +func TestEphemeralStore_SetValidation(t *testing.T) { + s := newTestStore(nil) + ctx := context.Background() + + if err := s.Set(ctx, "ns1", "", "t", "k", nil, 0); err == nil { + t.Error("expected error for empty client ID") + } + if err := s.Set(ctx, "ns1", "c", "", "k", nil, 0); err == nil { + t.Error("expected error for empty topic") + } + if err := s.Set(ctx, "ns1", "c", "t", "", nil, 0); err == nil { + t.Error("expected error for empty key") + } + big := make([]byte, ephemeralMaxPayloadBytes+1) + if err := s.Set(ctx, "ns1", "c", "t", "k", big, 0); err == nil { + t.Error("expected error for oversized payload") + } +} + +func TestEphemeralStore_ClearClientUnknownIsNoOp(t *testing.T) { + pub := &capturePublisher{} + s := newTestStore(pub.publish) + // No panic, no events for an unknown client. + s.ClearClient(context.Background(), "nobody") + if len(pub.snapshot()) != 0 { + t.Error("ClearClient on unknown client should publish nothing") + } +} + +func TestEphemeralStore_OwnershipTransfer(t *testing.T) { + pub := &capturePublisher{} + s := newTestStore(pub.publish) + ctx := context.Background() + + // client-A sets, then client-B overwrites the SAME (topic,key). + if err := s.Set(ctx, "ns1", "client-A", "t", "shared", []byte("a"), 0); err != nil { + t.Fatalf("Set A: %v", err) + } + if err := s.Set(ctx, "ns1", "client-B", "t", "shared", []byte("b"), 0); err != nil { + t.Fatalf("Set B: %v", err) + } + + // A's disconnect must NOT clear the key now owned by B. + s.ClearClient(ctx, "client-A") + if s.keyCountForTest() != 1 { + t.Errorf("ownership transfer failed: key dropped on prior owner's disconnect, count=%d", s.keyCountForTest()) + } + + // B's disconnect clears it. 
+ s.ClearClient(ctx, "client-B") + if s.keyCountForTest() != 0 { + t.Errorf("new owner's disconnect did not clear, count=%d", s.keyCountForTest()) + } +} diff --git a/core/pkg/serverless/execution/executor.go b/core/pkg/serverless/execution/executor.go index 465335d..177e62c 100644 --- a/core/pkg/serverless/execution/executor.go +++ b/core/pkg/serverless/execution/executor.go @@ -3,6 +3,7 @@ package execution import ( "bytes" "context" + cryptorand "crypto/rand" "encoding/json" "fmt" @@ -80,7 +81,15 @@ func (e *Executor) ExecuteModule(ctx context.Context, compiled wazero.CompiledMo // invocation that uses time.Now() (receipts, audit rows, cursor cmp). // Same fix applied at engine.go for the persistent-WS path. WithSysWalltime(). - WithSysNanotime() + WithSysNanotime(). + // Bugboard #120 — same class as #27. Without WithRandSource, wazero + // uses a deterministic zero-seed RNG, so TinyGo's crypto/rand.Read + // returns IDENTICAL bytes on every fresh instance (and every + // invocation is a fresh instance). That makes any unguessable ID / + // code / nonce / token constant. Wire in the host CSPRNG so + // crypto/rand (and auto-seeded math/rand) work. Same fix at + // engine.go for the persistent-WS path. + WithRandSource(cryptorand.Reader) // Acquire concurrency slot if e.sem != nil { diff --git a/core/pkg/serverless/execution/randsource_test.go b/core/pkg/serverless/execution/randsource_test.go new file mode 100644 index 0000000..576fec0 --- /dev/null +++ b/core/pkg/serverless/execution/randsource_test.go @@ -0,0 +1,181 @@ +package execution + +import ( + "context" + cryptorand "crypto/rand" + "encoding/binary" + "testing" + + "github.com/tetratelabs/wazero" + "github.com/tetratelabs/wazero/imports/wasi_snapshot_preview1" +) + +// Bugboard #120 — wazero defaults to a DETERMINISTIC (zero-seed) RNG source. 
+// TinyGo wasm's crypto/rand.Read calls WASI random_get, so without +// .WithRandSource(crypto/rand.Reader) every fresh instance gets the IDENTICAL +// "random" byte sequence. Each serverless invocation is a fresh instance, so +// any unguessable code / nonce / token a function generates is constant (the +// observed "8LRJ2S on every rotate" symptom). +// +// The fix is .WithRandSource(cryptorand.Reader) on BOTH wazero moduleConfig +// builders — executor.go (stateless) and engine.go (persistent WS). This test +// pins the executor's config path: instantiate the SAME config twice and assert +// the two instances produce DIFFERENT random bytes. +// +// If a future refactor drops .WithRandSource(), the positive test fails with a +// clear message; the negative control documents why the fix is necessary. + +// randProbeWasm is a hand-assembled WASM module that imports +// wasi_snapshot_preview1.random_get and calls it from _start, writing 8 random +// bytes to memory[0:8]. +// +// (module +// (type $random_get (func (param i32 i32) (result i32))) +// (type $start (func)) +// (import "wasi_snapshot_preview1" "random_get" +// (func $random_get (type 0))) +// (memory (export "memory") 1) +// (func $_start (type 1) +// i32.const 0 ;; buf = 0 +// i32.const 8 ;; buf_len = 8 +// call $random_get +// drop) +// (export "_start" (func $_start))) +var randProbeWasm = []byte{ + // Magic + version + 0x00, 0x61, 0x73, 0x6d, 0x01, 0x00, 0x00, 0x00, + + // Type section (id=1) — body=10 bytes + 0x01, + 0x0a, + 0x02, // 2 types + 0x60, 0x02, 0x7f, 0x7f, // type 0: func(i32, i32) + 0x01, 0x7f, // -> (i32) + 0x60, 0x00, 0x00, // type 1: func() -> () + + // Import section (id=2) — body=0x25 (37 bytes) + 0x02, + 0x25, + 0x01, // 1 import + 0x16, // module name "wasi_snapshot_preview1" length=22 + 0x77, 0x61, 0x73, 0x69, 0x5f, 0x73, 0x6e, 0x61, 0x70, 0x73, 0x68, 0x6f, 0x74, 0x5f, 0x70, 0x72, 0x65, 0x76, 0x69, 0x65, 0x77, 0x31, + 0x0a, // fn name "random_get" length=10 + 0x72, 0x61, 0x6e, 
0x64, 0x6f, 0x6d, 0x5f, 0x67, 0x65, 0x74, + 0x00, 0x00, // kind=func, type idx=0 + + // Function section (id=3) — body=2 bytes + 0x03, + 0x02, + 0x01, // 1 function + 0x01, // type idx 1 (for _start) + + // Memory section (id=5) — body=3 bytes + 0x05, + 0x03, + 0x01, // 1 memory + 0x00, 0x01, // limits: flags=0 (no max), min=1 page + + // Export section (id=7) — body=19 bytes (0x13) + 0x07, + 0x13, + 0x02, // 2 exports + 0x06, 0x6d, 0x65, 0x6d, 0x6f, 0x72, 0x79, // "memory" + 0x02, 0x00, // kind=memory, idx=0 + 0x06, 0x5f, 0x73, 0x74, 0x61, 0x72, 0x74, // "_start" + 0x00, 0x01, // kind=func, idx=1 (after the 1 import) + + // Code section (id=10) — body=11 bytes (0x0b) + 0x0a, + 0x0b, + 0x01, // 1 function body + 0x09, // body size = 9 + 0x00, // 0 local groups + 0x41, 0x00, // i32.const 0 (buf) + 0x41, 0x08, // i32.const 8 (buf_len) + 0x10, 0x00, // call func 0 (the imported random_get) + 0x1a, // drop (errno return) + 0x0b, // end +} + +// readProbeRandom instantiates randProbeWasm once with the given moduleConfig +// transform and returns the 8 random bytes written to memory[0:8]. 
+func readProbeRandom(t *testing.T, runtime wazero.Runtime, compiled wazero.CompiledModule, cfg wazero.ModuleConfig) uint64 { + t.Helper() + ctx := context.Background() + mod, err := runtime.InstantiateModule(ctx, compiled, cfg) + if err != nil { + t.Fatalf("instantiate probe module: %v", err) + } + defer mod.Close(ctx) + raw, ok := mod.Memory().Read(0, 8) + if !ok { + t.Fatal("could not read 8 bytes from probe memory at offset 0") + } + return binary.LittleEndian.Uint64(raw) +} + +func TestModuleConfig_randSourceIsRealNotDeterministic(t *testing.T) { + ctx := context.Background() + runtime := wazero.NewRuntime(ctx) + defer runtime.Close(ctx) + + if _, err := wasi_snapshot_preview1.Instantiate(ctx, runtime); err != nil { + t.Fatalf("instantiate WASI: %v", err) + } + compiled, err := runtime.CompileModule(ctx, randProbeWasm) + if err != nil { + t.Fatalf("compile probe wasm: %v (hex assembly likely off; recompute section sizes)", err) + } + defer compiled.Close(ctx) + + // Mirror the executor.go moduleConfig — anonymous instance, real RNG. Two + // separate instantiations of the SAME config must produce different bytes. + newCfg := func() wazero.ModuleConfig { + return wazero.NewModuleConfig(). + WithName(""). + WithArgs("probe"). + WithSysWalltime(). + WithSysNanotime(). + WithRandSource(cryptorand.Reader) + } + + a := readProbeRandom(t, runtime, compiled, newCfg()) + b := readProbeRandom(t, runtime, compiled, newCfg()) + if a == b { + t.Errorf("BUG #120 REGRESSION: two fresh instances produced IDENTICAL random "+ + "bytes (%#016x) — crypto/rand is deterministic. Did the "+ + ".WithRandSource(cryptorand.Reader) call get dropped from moduleConfig "+ + "in executor.go or engine.go?", a) + } +} + +func TestModuleConfig_randWithoutFix_demoDeterministic(t *testing.T) { + // Negative control: WITHOUT .WithRandSource(), confirm wazero's default RNG + // is deterministic (identical bytes across fresh instances). This pins the + // *cause*. 
If wazero ever defaults to a real entropy source, this test + // is skipped with a review note — making the change visible instead of silently + // invalidating the fix's necessity. + ctx := context.Background() + runtime := wazero.NewRuntime(ctx) + defer runtime.Close(ctx) + + if _, err := wasi_snapshot_preview1.Instantiate(ctx, runtime); err != nil { + t.Fatalf("instantiate WASI: %v", err) + } + compiled, err := runtime.CompileModule(ctx, randProbeWasm) + if err != nil { + t.Fatalf("compile probe wasm: %v", err) + } + defer compiled.Close(ctx) + + newDefault := func() wazero.ModuleConfig { + return wazero.NewModuleConfig().WithName("").WithArgs("probe") + } + a := readProbeRandom(t, runtime, compiled, newDefault()) + b := readProbeRandom(t, runtime, compiled, newDefault()) + if a != b { + t.Skipf("wazero default RandSource now differs across instances (%#016x vs %#016x) — "+ + "if real-by-default upstream, the bug-#120 fix may be redundant; review", a, b) + } + // Determinism confirmed → fix is meaningful. +} diff --git a/core/pkg/serverless/hostfuncs_test.go b/core/pkg/serverless/hostfuncs_test.go index 2565398..9c13fed 100644 --- a/core/pkg/serverless/hostfuncs_test.go +++ b/core/pkg/serverless/hostfuncs_test.go @@ -134,6 +134,18 @@ func (m *mockHostServices) WSPubSubUnbridge(ctx context.Context, clientID, topic return nil } +func (m *mockHostServices) SetHTTPResponse(ctx context.Context, status int, headers map[string]string, body []byte) error { + return SetRawHTTPResponse(ctx, status, headers, body) +} + +func (m *mockHostServices) EphemeralStateSet(ctx context.Context, topic, key string, payload []byte, ttlMs int64) error { + return nil +} + +func (m *mockHostServices) EphemeralStateClear(ctx context.Context, topic, key string) error { + return nil +} + func (m *mockHostServices) WSSend(ctx context.Context, clientID string, data []byte) error { return nil } diff --git a/core/pkg/serverless/hostfunctions/host_services.go b/core/pkg/serverless/hostfunctions/host_services.go index
2b25cd9..b38056f 100644 --- a/core/pkg/serverless/hostfunctions/host_services.go +++ b/core/pkg/serverless/hostfunctions/host_services.go @@ -1,6 +1,7 @@ package hostfunctions import ( + "context" "net/http" "time" @@ -57,7 +58,7 @@ func NewHostFunctions( anyoneHTTPClient.Timeout = httpTimeout } - return &HostFunctions{ + hf := &HostFunctions{ db: db, cacheClient: cacheClient, storage: storage, @@ -77,4 +78,28 @@ func NewHostFunctions( logs: make([]serverless.LogEntry, 0), asyncInvokeSem: make(chan struct{}, asyncInvokeMaxInFlight), } + + // Ephemeral-state store (bugboard #710). Publishes synthetic set/clear + // events through the same pubsub adapter the pubsub_publish host fn uses, + // and registers a WS disconnect hook so a client's owned state auto-clears + // the instant its WebSocket drops — zero cron lag. Only wired when a + // concrete WSManager is present (the disconnect hook + sweeper need it); + // otherwise ephemeral_state_set returns an error. + if wsm, ok := wsManager.(*serverless.WSManager); ok && wsm != nil { + var publish func(ctx context.Context, namespace, topic string, data []byte) error + if pubsubAdapter != nil { + publish = func(ctx context.Context, _ string, topic string, data []byte) error { + // The adapter namespaces internally (same as PubSubPublish), so + // the namespace arg is informational only here. 
+ return pubsubAdapter.Publish(ctx, topic, data) + } + } + hf.ephemeralStore = serverless.NewEphemeralStore(publish) + wsm.AddDisconnectHook(func(clientID string) { + hf.ephemeralStore.ClearClient(context.Background(), clientID) + }) + hf.ephemeralStore.StartSweeper() + } + + return hf } diff --git a/core/pkg/serverless/hostfunctions/http.go b/core/pkg/serverless/hostfunctions/http.go index 2efd0aa..9d4a933 100644 --- a/core/pkg/serverless/hostfunctions/http.go +++ b/core/pkg/serverless/hostfunctions/http.go @@ -17,6 +17,18 @@ func (h *HostFunctions) HTTPFetch(ctx context.Context, method, url string, heade return h.doFetch(ctx, "http_fetch", h.httpClient, method, url, headers, body) } +// SetHTTPResponse records a verbatim HTTP response for a RawHTTPResponse +// function (bugboard #835). It delegates to the per-invocation collector +// attached on ctx by the engine; the HTTP invoke handler replays the result +// byte-for-byte. Validation (raw mode enabled, status range, header/body caps) +// lives in serverless.SetRawHTTPResponse. 
+func (h *HostFunctions) SetHTTPResponse(ctx context.Context, status int, headers map[string]string, body []byte) error { + if err := serverless.SetRawHTTPResponse(ctx, status, headers, body); err != nil { + return &serverless.HostFunctionError{Function: "set_http_response", Cause: err} + } + return nil +} + // AnyoneFetch makes an outbound HTTP request routed through the Anyone // (ANyONe protocol) SOCKS5 proxy, so the third-party endpoint sees an // Anyone exit IP instead of the gateway IP and the gateway can't diff --git a/core/pkg/serverless/hostfunctions/pubsub.go b/core/pkg/serverless/hostfunctions/pubsub.go index 5df0556..dd05d6b 100644 --- a/core/pkg/serverless/hostfunctions/pubsub.go +++ b/core/pkg/serverless/hostfunctions/pubsub.go @@ -186,6 +186,40 @@ func dedupBatchByTopic(msgs []pubsub.TopicMessage) []pubsub.TopicMessage { return out } +// EphemeralStateSet records WS-subscribe-tracked ephemeral state for the +// current invocation's WS client and publishes a "set" event (bugboard #710). +// The owning client ID and namespace are derived from the invocation context — +// the function cannot spoof them. Auto-clears on the client's WS disconnect. +func (h *HostFunctions) EphemeralStateSet(ctx context.Context, topic, key string, payload []byte, ttlMs int64) error { + if h.ephemeralStore == nil { + return &serverless.HostFunctionError{Function: "ephemeral_state_set", Cause: fmt.Errorf("ephemeral state not available on this gateway")} + } + cur := h.currentInvocationContext(ctx) + if cur == nil { + return &serverless.HostFunctionError{Function: "ephemeral_state_set", Cause: fmt.Errorf("no invocation context")} + } + if err := h.ephemeralStore.Set(ctx, cur.Namespace, cur.WSClientID, topic, key, payload, ttlMs); err != nil { + return &serverless.HostFunctionError{Function: "ephemeral_state_set", Cause: err} + } + return nil +} + +// EphemeralStateClear removes ephemeral state the current WS client owns and +// publishes a "clear" event (bugboard #710). 
Idempotent. +func (h *HostFunctions) EphemeralStateClear(ctx context.Context, topic, key string) error { + if h.ephemeralStore == nil { + return &serverless.HostFunctionError{Function: "ephemeral_state_clear", Cause: fmt.Errorf("ephemeral state not available on this gateway")} + } + cur := h.currentInvocationContext(ctx) + if cur == nil { + return &serverless.HostFunctionError{Function: "ephemeral_state_clear", Cause: fmt.Errorf("no invocation context")} + } + if err := h.ephemeralStore.Clear(ctx, cur.Namespace, cur.WSClientID, topic, key); err != nil { + return &serverless.HostFunctionError{Function: "ephemeral_state_clear", Cause: err} + } + return nil +} + // WSSend sends data to a specific WebSocket client. func (h *HostFunctions) WSSend(ctx context.Context, clientID string, data []byte) error { if h.wsManager == nil { diff --git a/core/pkg/serverless/hostfunctions/secrets.go b/core/pkg/serverless/hostfunctions/secrets.go index c87019d..5dce599 100644 --- a/core/pkg/serverless/hostfunctions/secrets.go +++ b/core/pkg/serverless/hostfunctions/secrets.go @@ -14,6 +14,9 @@ import ( "go.uber.org/zap" ) +// secretsKeyBytes is the required length of the AES-256 encryption key. +const secretsKeyBytes = 32 + // DBSecretsManager implements SecretsManager using the database. type DBSecretsManager struct { db rqlite.Client @@ -25,21 +28,34 @@ type DBSecretsManager struct { var _ serverless.SecretsManager = (*DBSecretsManager)(nil) // NewDBSecretsManager creates a secrets manager backed by the database. -func NewDBSecretsManager(db rqlite.Client, encryptionKeyHex string, logger *zap.Logger) (*DBSecretsManager, error) { +// +// encryptionKeyHex must be a 32-byte AES-256 key, hex-encoded (64 chars). +// +// When encryptionKeyHex is empty the behaviour depends on allowEphemeral: +// - allowEphemeral=false (production): returns an error. A misconfigured +// node must fail loudly rather than silently generate a per-process +// ephemeral key. 
With an ephemeral key, secrets encrypted by one +// process cannot be decrypted by another (or after a restart), which +// makes get_secret return garbage/errors (bugboard #837). +// - allowEphemeral=true (tests/dev): generates a random per-process key +// and logs a warning. Secrets will not persist across restarts. +func NewDBSecretsManager(db rqlite.Client, encryptionKeyHex string, allowEphemeral bool, logger *zap.Logger) (*DBSecretsManager, error) { var key []byte if encryptionKeyHex != "" { var err error key, err = hex.DecodeString(encryptionKeyHex) - if err != nil || len(key) != 32 { - return nil, fmt.Errorf("invalid encryption key: must be 32 bytes hex-encoded") + if err != nil || len(key) != secretsKeyBytes { + return nil, fmt.Errorf("invalid secrets encryption key: must be %d bytes hex-encoded (%d hex chars)", secretsKeyBytes, secretsKeyBytes*2) } - } else { - // Generate a random key if none provided - key = make([]byte, 32) + } else if allowEphemeral { + // Generate a random per-process key (dev/test only). 
+ key = make([]byte, secretsKeyBytes) if _, err := rand.Read(key); err != nil { - return nil, fmt.Errorf("failed to generate encryption key: %w", err) + return nil, fmt.Errorf("failed to generate ephemeral secrets encryption key: %w", err) } - logger.Warn("Generated random secrets encryption key - secrets will not persist across restarts") + logger.Warn("Generated random ephemeral secrets encryption key - secrets will NOT persist across restarts (dev/test only)") + } else { + return nil, fmt.Errorf("secrets encryption key is required: set secrets_encryption_key (see %s/secrets/secrets-encryption-key); without it secrets cannot be decrypted across processes or restarts (bugboard #837)", "~/.orama") } return &DBSecretsManager{ diff --git a/core/pkg/serverless/hostfunctions/secrets_test.go b/core/pkg/serverless/hostfunctions/secrets_test.go new file mode 100644 index 0000000..4ad1f70 --- /dev/null +++ b/core/pkg/serverless/hostfunctions/secrets_test.go @@ -0,0 +1,199 @@ +package hostfunctions + +import ( + "context" + "database/sql" + "errors" + "strings" + "testing" + + "github.com/DeBrosOfficial/network/pkg/rqlite" + "github.com/DeBrosOfficial/network/pkg/serverless" + "go.uber.org/zap" +) + +// fakeSecretsDB is an in-memory rqlite.Client stub that implements only the +// Exec/Query paths used by DBSecretsManager (INSERT...ON CONFLICT upsert and +// SELECT by namespace+name). Storing the encrypted blob in a map lets us +// round-trip a Set through a Get — the core of the bugboard #837 regression. +type fakeSecretsDB struct { + rqlite.Client + store map[string][]byte // key: namespace\x00name -> encrypted_value +} + +func newFakeSecretsDB() *fakeSecretsDB { + return &fakeSecretsDB{store: map[string][]byte{}} +} + +func storeKey(namespace, name string) string { + return namespace + "\x00" + name +} + +// Exec handles the upsert. args order matches secrets.go Set(): +// (id, namespace, name, encrypted_value, created_at, updated_at). 
+func (f *fakeSecretsDB) Exec(ctx context.Context, query string, args ...any) (sql.Result, error) { + if strings.Contains(query, "INSERT INTO function_secrets") { + namespace, _ := args[1].(string) + name, _ := args[2].(string) + enc, _ := args[3].([]byte) + cp := make([]byte, len(enc)) + copy(cp, enc) + f.store[storeKey(namespace, name)] = cp + return fakeResult{rows: 1}, nil + } + return fakeResult{}, nil +} + +// Query handles the SELECT encrypted_value ... WHERE namespace=? AND name=?. +func (f *fakeSecretsDB) Query(ctx context.Context, dest any, query string, args ...any) error { + if !strings.Contains(query, "SELECT encrypted_value") { + return errors.New("unexpected query") + } + namespace, _ := args[0].(string) + name, _ := args[1].(string) + rows, ok := dest.(*[]struct { + EncryptedValue []byte `db:"encrypted_value"` + }) + if !ok { + return errors.New("unexpected dest type") + } + if enc, found := f.store[storeKey(namespace, name)]; found { + *rows = append(*rows, struct { + EncryptedValue []byte `db:"encrypted_value"` + }{EncryptedValue: enc}) + } + return nil +} + +type fakeResult struct{ rows int64 } + +func (r fakeResult) LastInsertId() (int64, error) { return 0, nil } +func (r fakeResult) RowsAffected() (int64, error) { return r.rows, nil } + +// validKey is a 32-byte AES-256 key, hex-encoded (64 chars). +const validKey = "0123456789abcdef0123456789abcdef0123456789abcdef0123456789abcdef" + +// otherKey is a different valid 32-byte key. +const otherKey = "fedcba9876543210fedcba9876543210fedcba9876543210fedcba9876543210" + +// TestDBSecretsManager_SetGetRoundTrip_sameKey proves the fix: a secret +// encrypted with a fixed key is decryptable by a SEPARATE manager constructed +// with the SAME key (simulating another process / a restart). 
+func TestDBSecretsManager_SetGetRoundTrip_sameKey(t *testing.T) { + db := newFakeSecretsDB() + logger := zap.NewNop() + ctx := context.Background() + + writer, err := NewDBSecretsManager(db, validKey, false, logger) + if err != nil { + t.Fatalf("NewDBSecretsManager (writer) failed: %v", err) + } + if err := writer.Set(ctx, "ns1", "API_TOKEN", "s3cr3t-value"); err != nil { + t.Fatalf("Set failed: %v", err) + } + + // A fresh manager with the SAME key (different process / post-restart). + reader, err := NewDBSecretsManager(db, validKey, false, logger) + if err != nil { + t.Fatalf("NewDBSecretsManager (reader) failed: %v", err) + } + got, err := reader.Get(ctx, "ns1", "API_TOKEN") + if err != nil { + t.Fatalf("Get failed: %v", err) + } + if got != "s3cr3t-value" { + t.Errorf("Get returned %q, want %q", got, "s3cr3t-value") + } +} + +// TestDBSecretsManager_GetWithDifferentKey_fails proves the bug it guards +// against: a manager with a DIFFERENT key cannot decrypt — exactly what +// happened when each process generated its own ephemeral key (bugboard #837). +func TestDBSecretsManager_GetWithDifferentKey_fails(t *testing.T) { + db := newFakeSecretsDB() + logger := zap.NewNop() + ctx := context.Background() + + writer, err := NewDBSecretsManager(db, validKey, false, logger) + if err != nil { + t.Fatalf("NewDBSecretsManager (writer) failed: %v", err) + } + if err := writer.Set(ctx, "ns1", "API_TOKEN", "s3cr3t-value"); err != nil { + t.Fatalf("Set failed: %v", err) + } + + reader, err := NewDBSecretsManager(db, otherKey, false, logger) + if err != nil { + t.Fatalf("NewDBSecretsManager (reader) failed: %v", err) + } + if _, err := reader.Get(ctx, "ns1", "API_TOKEN"); err == nil { + t.Fatal("expected decryption to fail with a different key, got nil error") + } +} + +// TestDBSecretsManager_emptyKey_isLoud verifies the production constructor +// refuses to start with an empty key (allowEphemeral=false) instead of +// silently generating an undecryptable ephemeral key. 
+func TestDBSecretsManager_emptyKey_isLoud(t *testing.T) { + db := newFakeSecretsDB() + _, err := NewDBSecretsManager(db, "", false, zap.NewNop()) + if err == nil { + t.Fatal("expected error for empty key with allowEphemeral=false, got nil") + } + if !strings.Contains(err.Error(), "secrets encryption key is required") { + t.Errorf("unexpected error message: %v", err) + } +} + +// TestDBSecretsManager_emptyKey_ephemeralAllowed verifies tests/dev can still +// opt into a per-process ephemeral key. +func TestDBSecretsManager_emptyKey_ephemeralAllowed(t *testing.T) { + db := newFakeSecretsDB() + mgr, err := NewDBSecretsManager(db, "", true, zap.NewNop()) + if err != nil { + t.Fatalf("expected ephemeral key to be allowed, got error: %v", err) + } + // Ephemeral key still round-trips within the same process. + ctx := context.Background() + if err := mgr.Set(ctx, "ns1", "K", "v"); err != nil { + t.Fatalf("Set failed: %v", err) + } + got, err := mgr.Get(ctx, "ns1", "K") + if err != nil { + t.Fatalf("Get failed: %v", err) + } + if got != "v" { + t.Errorf("Get returned %q, want %q", got, "v") + } +} + +// TestDBSecretsManager_invalidKey_rejected covers malformed keys (wrong +// length, non-hex) at the boundary. +func TestDBSecretsManager_invalidKey_rejected(t *testing.T) { + db := newFakeSecretsDB() + cases := map[string]string{ + "too short": "abcd", + "odd hex": "abc", + "not hex": strings.Repeat("zz", 32), + "wrong bytes": "0123456789abcdef", // 8 bytes, not 32 + } + for name, key := range cases { + t.Run(name, func(t *testing.T) { + if _, err := NewDBSecretsManager(db, key, false, zap.NewNop()); err == nil { + t.Fatalf("expected error for invalid key %q, got nil", key) + } + }) + } +} + +// TestDBSecretsManager_Get_notFound verifies the not-found sentinel survives. 
+func TestDBSecretsManager_Get_notFound(t *testing.T) { + db := newFakeSecretsDB() + mgr, err := NewDBSecretsManager(db, validKey, false, zap.NewNop()) + if err != nil { + t.Fatalf("NewDBSecretsManager failed: %v", err) + } + if _, err := mgr.Get(context.Background(), "ns1", "missing"); !errors.Is(err, serverless.ErrSecretNotFound) { + t.Errorf("expected ErrSecretNotFound, got %v", err) + } +} diff --git a/core/pkg/serverless/hostfunctions/types.go b/core/pkg/serverless/hostfunctions/types.go index b082ee3..1fa9216 100644 --- a/core/pkg/serverless/hostfunctions/types.go +++ b/core/pkg/serverless/hostfunctions/types.go @@ -97,6 +97,13 @@ type HostFunctions struct { triggerDispatcher *triggers.PubSubDispatcher triggerDispatcherLock sync.RWMutex + // ephemeralStore backs ephemeral_state_set / ephemeral_state_clear + // (bugboard #710). Constructed in NewHostFunctions when a WS manager is + // present; nil otherwise (host fns then return an error). The store + // registers a disconnect hook on the WS manager so a client's owned state + // auto-clears the instant its WebSocket disconnects. + ephemeralStore *serverless.EphemeralStore + // Current invocation context (set per-execution) invCtx *serverless.InvocationContext invCtxLock sync.RWMutex diff --git a/core/pkg/serverless/invoke.go b/core/pkg/serverless/invoke.go index 84d0b82..d101d84 100644 --- a/core/pkg/serverless/invoke.go +++ b/core/pkg/serverless/invoke.go @@ -75,6 +75,13 @@ type InvokeResponse struct { Error string `json:"error,omitempty"` DurationMS int64 `json:"duration_ms"` Retries int `json:"retries,omitempty"` + + // RawHTTP carries a verbatim HTTP response set by a RawHTTPResponse + // function via set_http_response (bugboard #835). nil for normal + // functions and for raw functions that never called set_http_response — + // the HTTP handler falls back to the standard JSON/Ack path in that case. + // Not serialized; consumed directly by the HTTP invoke handler. 
+ RawHTTP *RawHTTPResult `json:"-"` } // Invoke executes a function with automatic retry logic. @@ -169,6 +176,8 @@ func (i *Invoker) Invoke(ctx context.Context, req *InvokeRequest) (*InvokeRespon } response.Status = InvocationStatusSuccess + // Surface any verbatim HTTP response the function set (bugboard #835). + response.RawHTTP = invCtx.RawHTTP return response, nil } diff --git a/core/pkg/serverless/mocks_test.go b/core/pkg/serverless/mocks_test.go index 60ad60f..0240dc0 100644 --- a/core/pkg/serverless/mocks_test.go +++ b/core/pkg/serverless/mocks_test.go @@ -247,6 +247,18 @@ func (m *MockHostServices) WSPubSubUnbridge(ctx context.Context, clientID, topic return nil } +func (m *MockHostServices) SetHTTPResponse(ctx context.Context, status int, headers map[string]string, body []byte) error { + return SetRawHTTPResponse(ctx, status, headers, body) +} + +func (m *MockHostServices) EphemeralStateSet(ctx context.Context, topic, key string, payload []byte, ttlMs int64) error { + return nil +} + +func (m *MockHostServices) EphemeralStateClear(ctx context.Context, topic, key string) error { + return nil +} + func (m *MockHostServices) WSSend(ctx context.Context, clientID string, data []byte) error { return nil } diff --git a/core/pkg/serverless/raw_http.go b/core/pkg/serverless/raw_http.go new file mode 100644 index 0000000..b213b1e --- /dev/null +++ b/core/pkg/serverless/raw_http.go @@ -0,0 +1,142 @@ +package serverless + +import ( + "context" + "fmt" + "sync" +) + +// Raw-HTTP-response mode (bugboard #835). +// +// A function deployed with RawHTTPResponse=true can emit a verbatim HTTP +// response (status + headers + body) instead of the JSON/Ack-wrapped output +// the stateless invoke handler normally produces. This lets a namespace app +// proxy an upstream RPC (Helius / Alchemy) transparently — the function reads +// the request, calls the upstream, and replays the upstream's status, headers, +// and body byte-for-byte back to its own caller. 
+// +// The primitive provided here is ONLY the response carrier + the host-call +// validation. Per-user-JWT quota gating (which the ticket mentions) is the +// APP's responsibility: the function can call oh.GetCallerJwtSubject() and +// decide whether to serve. The gateway does not implement quota here. + +const ( + // rawHTTPMaxHeaders caps how many response headers a function may set. + // Generous for a proxy use-case (upstream RPCs return well under this) + // while bounding the per-invocation allocation a hostile function could + // force. + rawHTTPMaxHeaders = 64 + + // rawHTTPMaxBodyBytes caps the verbatim response body a function may set. + // 8 MiB comfortably covers JSON-RPC responses (even large getBlock / + // getProgramAccounts payloads) without letting a function buffer an + // unbounded body in gateway memory. + rawHTTPMaxBodyBytes = 8 << 20 + + // rawHTTPMinStatus / rawHTTPMaxStatus bound a valid HTTP status code. + rawHTTPMinStatus = 100 + rawHTTPMaxStatus = 599 +) + +// RawHTTPResult is a verbatim HTTP response set by a RawHTTPResponse function. +// Set is true once the function has called set_http_response at least once; +// the invoke handler only takes the raw path when Set is true (otherwise it +// falls back to the normal JSON/Ack-wrapped behavior). +type RawHTTPResult struct { + Status int + Headers map[string]string + Body []byte + Set bool +} + +// rawHTTPCollector is the mutable per-invocation sink the set_http_response +// host function writes to. It rides the invocation's context (same per-call +// propagation model as the publish counter and log buffer) so concurrent +// invocations never cross-write each other's response. +type rawHTTPCollector struct { + mu sync.Mutex + result RawHTTPResult +} + +// rawHTTPKey is the unexported context-value key for the raw-HTTP collector. +type rawHTTPKey struct{} + +// WithRawHTTPCollector returns a derived ctx carrying a FRESH per-invocation +// raw-HTTP response collector. 
The engine attaches this before executing a +// RawHTTPResponse function so the set_http_response host call has somewhere to +// write; for non-raw functions the collector is absent and the host call +// returns an error. +func WithRawHTTPCollector(ctx context.Context) context.Context { + return context.WithValue(ctx, rawHTTPKey{}, &rawHTTPCollector{}) +} + +// rawHTTPCollectorFromCtx extracts the collector attached via +// WithRawHTTPCollector, or nil if none is present (non-raw function, or an +// untracked code path). +func rawHTTPCollectorFromCtx(ctx context.Context) *rawHTTPCollector { + if ctx == nil { + return nil + } + c, _ := ctx.Value(rawHTTPKey{}).(*rawHTTPCollector) + return c +} + +// SetRawHTTPResponse records a verbatim HTTP response on the invocation's +// collector. Returns an error if no collector is attached (the function was +// not deployed with RawHTTPResponse), or if the status / header count / body +// size fail validation. Headers may be nil. The body is copied so the caller +// (which reads it out of guest WASM memory) may reuse its buffer.
+func SetRawHTTPResponse(ctx context.Context, status int, headers map[string]string, body []byte) error { + c := rawHTTPCollectorFromCtx(ctx) + if c == nil { + return fmt.Errorf("set_http_response: function is not deployed with raw_http_response enabled") + } + if status < rawHTTPMinStatus || status > rawHTTPMaxStatus { + return fmt.Errorf("set_http_response: status %d out of range [%d,%d]", status, rawHTTPMinStatus, rawHTTPMaxStatus) + } + if len(headers) > rawHTTPMaxHeaders { + return fmt.Errorf("set_http_response: too many headers (%d > %d)", len(headers), rawHTTPMaxHeaders) + } + if len(body) > rawHTTPMaxBodyBytes { + return fmt.Errorf("set_http_response: body too large (%d bytes > %d)", len(body), rawHTTPMaxBodyBytes) + } + + bodyCopy := make([]byte, len(body)) + copy(bodyCopy, body) + + var hdrCopy map[string]string + if len(headers) > 0 { + hdrCopy = make(map[string]string, len(headers)) + for k, v := range headers { + hdrCopy[k] = v + } + } + + c.mu.Lock() + c.result = RawHTTPResult{ + Status: status, + Headers: hdrCopy, + Body: bodyCopy, + Set: true, + } + c.mu.Unlock() + return nil +} + +// TakeRawHTTPResponse returns the raw HTTP response recorded on the ctx's +// collector and whether one was set. Returns (zero, false) when no collector +// is attached or the function never called set_http_response. The engine calls +// this after Execute to surface the response on the InvokeResponse. 
+func TakeRawHTTPResponse(ctx context.Context) (RawHTTPResult, bool) { + c := rawHTTPCollectorFromCtx(ctx) + if c == nil { + return RawHTTPResult{}, false + } + c.mu.Lock() + res := c.result + c.mu.Unlock() + if !res.Set { + return RawHTTPResult{}, false + } + return res, true +} diff --git a/core/pkg/serverless/raw_http_test.go b/core/pkg/serverless/raw_http_test.go new file mode 100644 index 0000000..f600ea3 --- /dev/null +++ b/core/pkg/serverless/raw_http_test.go @@ -0,0 +1,129 @@ +package serverless + +import ( + "bytes" + "context" + "strings" + "testing" +) + +func TestSetRawHTTPResponse_happyPath(t *testing.T) { + ctx := WithRawHTTPCollector(context.Background()) + + headers := map[string]string{"Content-Type": "application/json"} + body := []byte(`{"jsonrpc":"2.0","result":42}`) + if err := SetRawHTTPResponse(ctx, 200, headers, body); err != nil { + t.Fatalf("SetRawHTTPResponse: unexpected error: %v", err) + } + + res, ok := TakeRawHTTPResponse(ctx) + if !ok { + t.Fatal("TakeRawHTTPResponse: expected a response to be set") + } + if res.Status != 200 { + t.Errorf("status = %d, want 200", res.Status) + } + if res.Headers["Content-Type"] != "application/json" { + t.Errorf("Content-Type header = %q, want application/json", res.Headers["Content-Type"]) + } + if !bytes.Equal(res.Body, body) { + t.Errorf("body = %q, want %q", res.Body, body) + } +} + +func TestSetRawHTTPResponse_copiesBodyAndHeaders(t *testing.T) { + ctx := WithRawHTTPCollector(context.Background()) + + headers := map[string]string{"X-Test": "v1"} + body := []byte("original") + if err := SetRawHTTPResponse(ctx, 200, headers, body); err != nil { + t.Fatalf("SetRawHTTPResponse: %v", err) + } + + // Mutate caller-owned buffers AFTER the call — the stored copy must not change. 
+ body[0] = 'X' + headers["X-Test"] = "mutated" + + res, _ := TakeRawHTTPResponse(ctx) + if string(res.Body) != "original" { + t.Errorf("body was not copied: got %q", res.Body) + } + if res.Headers["X-Test"] != "v1" { + t.Errorf("headers were not copied: got %q", res.Headers["X-Test"]) + } +} + +func TestSetRawHTTPResponse_noCollector(t *testing.T) { + // No collector attached → the function is not in raw mode; must error. + err := SetRawHTTPResponse(context.Background(), 200, nil, []byte("x")) + if err == nil { + t.Fatal("expected error when no collector is attached") + } + if !strings.Contains(err.Error(), "raw_http_response") { + t.Errorf("error = %q, want it to mention raw_http_response", err.Error()) + } +} + +func TestSetRawHTTPResponse_rejectsBadStatus(t *testing.T) { + for _, status := range []int{0, 99, 600, 1000, -1} { + ctx := WithRawHTTPCollector(context.Background()) + if err := SetRawHTTPResponse(ctx, status, nil, nil); err == nil { + t.Errorf("status %d: expected validation error, got nil", status) + } + if _, ok := TakeRawHTTPResponse(ctx); ok { + t.Errorf("status %d: response should not be set after a rejected status", status) + } + } +} + +func TestSetRawHTTPResponse_rejectsTooManyHeaders(t *testing.T) { + ctx := WithRawHTTPCollector(context.Background()) + headers := make(map[string]string, rawHTTPMaxHeaders+1) + for i := 0; i <= rawHTTPMaxHeaders; i++ { + headers["h"+string(rune('a'+i%26))+string(rune('0'+i/26))] = "v" + } + if len(headers) <= rawHTTPMaxHeaders { + t.Fatalf("test setup: expected > %d headers, got %d", rawHTTPMaxHeaders, len(headers)) + } + if err := SetRawHTTPResponse(ctx, 200, headers, nil); err == nil { + t.Fatal("expected error for too many headers") + } +} + +func TestSetRawHTTPResponse_rejectsOversizedBody(t *testing.T) { + ctx := WithRawHTTPCollector(context.Background()) + body := make([]byte, rawHTTPMaxBodyBytes+1) + if err := SetRawHTTPResponse(ctx, 200, nil, body); err == nil { + t.Fatal("expected error for oversized 
body") + } +} + +func TestTakeRawHTTPResponse_notSet(t *testing.T) { + // Collector attached but set_http_response never called → (zero, false). + ctx := WithRawHTTPCollector(context.Background()) + if _, ok := TakeRawHTTPResponse(ctx); ok { + t.Fatal("expected ok=false when no response was set") + } + + // No collector at all → also (zero, false). + if _, ok := TakeRawHTTPResponse(context.Background()); ok { + t.Fatal("expected ok=false with no collector") + } +} + +func TestSetRawHTTPResponse_lastWriteWins(t *testing.T) { + ctx := WithRawHTTPCollector(context.Background()) + if err := SetRawHTTPResponse(ctx, 200, nil, []byte("first")); err != nil { + t.Fatalf("first SetRawHTTPResponse: %v", err) + } + if err := SetRawHTTPResponse(ctx, 503, map[string]string{"Retry-After": "5"}, []byte("second")); err != nil { + t.Fatalf("second SetRawHTTPResponse: %v", err) + } + res, ok := TakeRawHTTPResponse(ctx) + if !ok { + t.Fatal("expected response to be set") + } + if res.Status != 503 || string(res.Body) != "second" || res.Headers["Retry-After"] != "5" { + t.Errorf("last-write-wins failed: got status=%d body=%q headers=%v", res.Status, res.Body, res.Headers) + } +} diff --git a/core/pkg/serverless/registry.go b/core/pkg/serverless/registry.go index fe28c84..db8d0b5 100644 --- a/core/pkg/serverless/registry.go +++ b/core/pkg/serverless/registry.go @@ -107,8 +107,9 @@ func (r *Registry) Register(ctx context.Context, fn *FunctionDefinition, wasmByt memory_limit_mb, timeout_seconds, is_public, retry_count, retry_delay_seconds, dlq_topic, status, created_at, updated_at, created_by, - ws_persistent, ws_idle_timeout_sec, ws_max_frame_bytes, ws_max_inflight_per_conn - ) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?) + ws_persistent, ws_idle_timeout_sec, ws_max_frame_bytes, ws_max_inflight_per_conn, + raw_http_response + ) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?) 
` _, err = r.db.Exec(ctx, query, id, fn.Name, fn.Namespace, version, wasmCID, @@ -116,6 +117,7 @@ func (r *Registry) Register(ctx context.Context, fn *FunctionDefinition, wasmByt fn.RetryCount, retryDelay, fn.DLQTopic, string(FunctionStatusActive), now, now, fn.Namespace, fn.WSPersistent, fn.WSIdleTimeoutSec, fn.WSMaxFrameBytes, fn.WSMaxInflightPerConn, + fn.RawHTTPResponse, ) if err != nil { return nil, &DeployError{FunctionName: fn.Name, Cause: fmt.Errorf("failed to register function: %w", err)} @@ -154,7 +156,8 @@ func (r *Registry) Get(ctx context.Context, namespace, name string, version int) memory_limit_mb, timeout_seconds, is_public, retry_count, retry_delay_seconds, dlq_topic, status, created_at, updated_at, created_by, - ws_persistent, ws_idle_timeout_sec, ws_max_frame_bytes, ws_max_inflight_per_conn + ws_persistent, ws_idle_timeout_sec, ws_max_frame_bytes, ws_max_inflight_per_conn, + raw_http_response FROM functions WHERE namespace = ? AND name = ? AND status = ? ORDER BY version DESC @@ -167,7 +170,8 @@ func (r *Registry) Get(ctx context.Context, namespace, name string, version int) memory_limit_mb, timeout_seconds, is_public, retry_count, retry_delay_seconds, dlq_topic, status, created_at, updated_at, created_by, - ws_persistent, ws_idle_timeout_sec, ws_max_frame_bytes, ws_max_inflight_per_conn + ws_persistent, ws_idle_timeout_sec, ws_max_frame_bytes, ws_max_inflight_per_conn, + raw_http_response FROM functions WHERE namespace = ? AND name = ? AND version = ? 
` @@ -197,7 +201,8 @@ func (r *Registry) List(ctx context.Context, namespace string) ([]*Function, err f.memory_limit_mb, f.timeout_seconds, f.is_public, f.retry_count, f.retry_delay_seconds, f.dlq_topic, f.status, f.created_at, f.updated_at, f.created_by, - f.ws_persistent, f.ws_idle_timeout_sec, f.ws_max_frame_bytes, f.ws_max_inflight_per_conn + f.ws_persistent, f.ws_idle_timeout_sec, f.ws_max_frame_bytes, f.ws_max_inflight_per_conn, + f.raw_http_response FROM functions f INNER JOIN ( SELECT namespace, name, MAX(version) as max_version @@ -668,6 +673,11 @@ func (r *Registry) rowToFunction(row *functionRow) *Function { WSIdleTimeoutSec: row.WSIdleTimeoutSec, WSMaxFrameBytes: row.WSMaxFrameBytes, WSMaxInflightPerConn: row.WSMaxInflightPerConn, + + // Raw-HTTP-response mode (bugboard #835). Without reading this back + // the invoke handler's `if fn.RawHTTPResponse` engine branch never + // fires and set_http_response is a no-op for every function. + RawHTTPResponse: row.RawHTTPResponse, } } @@ -716,6 +726,11 @@ type functionRow struct { WSIdleTimeoutSec int `db:"ws_idle_timeout_sec"` WSMaxFrameBytes int `db:"ws_max_frame_bytes"` WSMaxInflightPerConn int `db:"ws_max_inflight_per_conn"` + + // Raw-HTTP-response mode (bugboard #835). Backed by migration + // 029_raw_http_response.sql; defaults to false so existing functions + // keep the JSON/Ack-wrapped behavior. 
+ RawHTTPResponse bool `db:"raw_http_response"` } type envVarRow struct { diff --git a/core/pkg/serverless/registry/function_store.go b/core/pkg/serverless/registry/function_store.go index 1bfdcac..ff54a6e 100644 --- a/core/pkg/serverless/registry/function_store.go +++ b/core/pkg/serverless/registry/function_store.go @@ -57,8 +57,9 @@ func (s *FunctionStore) Save(ctx context.Context, fn *FunctionDefinition, wasmCI memory_limit_mb, timeout_seconds, is_public, retry_count, retry_delay_seconds, dlq_topic, status, created_at, updated_at, created_by, - ws_persistent, ws_idle_timeout_sec, ws_max_frame_bytes, ws_max_inflight_per_conn - ) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?) + ws_persistent, ws_idle_timeout_sec, ws_max_frame_bytes, ws_max_inflight_per_conn, + raw_http_response + ) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?) ` _, err := s.db.Exec(ctx, query, id, fn.Name, fn.Namespace, version, wasmCID, @@ -66,6 +67,7 @@ func (s *FunctionStore) Save(ctx context.Context, fn *FunctionDefinition, wasmCI fn.RetryCount, retryDelay, fn.DLQTopic, string(FunctionStatusActive), now, now, fn.Namespace, fn.WSPersistent, fn.WSIdleTimeoutSec, fn.WSMaxFrameBytes, fn.WSMaxInflightPerConn, + fn.RawHTTPResponse, ) if err != nil { return nil, fmt.Errorf("failed to save function: %w", err) @@ -101,6 +103,7 @@ func (s *FunctionStore) Save(ctx context.Context, fn *FunctionDefinition, wasmCI WSIdleTimeoutSec: fn.WSIdleTimeoutSec, WSMaxFrameBytes: fn.WSMaxFrameBytes, WSMaxInflightPerConn: fn.WSMaxInflightPerConn, + RawHTTPResponse: fn.RawHTTPResponse, }, nil } @@ -114,7 +117,7 @@ func (s *FunctionStore) Get(ctx context.Context, namespace, name string, version if version == 0 { query = ` - SELECT id, name, namespace, version, wasm_cid, source_cid, ws_persistent, ws_idle_timeout_sec, ws_max_frame_bytes, ws_max_inflight_per_conn, + SELECT id, name, namespace, version, wasm_cid, source_cid, ws_persistent, ws_idle_timeout_sec, ws_max_frame_bytes, 
ws_max_inflight_per_conn, raw_http_response, memory_limit_mb, timeout_seconds, is_public, retry_count, retry_delay_seconds, dlq_topic, status, created_at, updated_at, created_by @@ -126,7 +129,7 @@ func (s *FunctionStore) Get(ctx context.Context, namespace, name string, version args = []interface{}{namespace, name, string(FunctionStatusActive)} } else { query = ` - SELECT id, name, namespace, version, wasm_cid, source_cid, ws_persistent, ws_idle_timeout_sec, ws_max_frame_bytes, ws_max_inflight_per_conn, + SELECT id, name, namespace, version, wasm_cid, source_cid, ws_persistent, ws_idle_timeout_sec, ws_max_frame_bytes, ws_max_inflight_per_conn, raw_http_response, memory_limit_mb, timeout_seconds, is_public, retry_count, retry_delay_seconds, dlq_topic, status, created_at, updated_at, created_by @@ -154,7 +157,7 @@ func (s *FunctionStore) Get(ctx context.Context, namespace, name string, version // GetByID retrieves a function by its ID. func (s *FunctionStore) GetByID(ctx context.Context, id string) (*Function, error) { query := ` - SELECT id, name, namespace, version, wasm_cid, source_cid, ws_persistent, ws_idle_timeout_sec, ws_max_frame_bytes, ws_max_inflight_per_conn, + SELECT id, name, namespace, version, wasm_cid, source_cid, ws_persistent, ws_idle_timeout_sec, ws_max_frame_bytes, ws_max_inflight_per_conn, raw_http_response, memory_limit_mb, timeout_seconds, is_public, retry_count, retry_delay_seconds, dlq_topic, status, created_at, updated_at, created_by @@ -180,7 +183,7 @@ func (s *FunctionStore) GetByNameInternal(ctx context.Context, namespace, name s name = strings.TrimSpace(name) query := ` - SELECT id, name, namespace, version, wasm_cid, source_cid, ws_persistent, ws_idle_timeout_sec, ws_max_frame_bytes, ws_max_inflight_per_conn, + SELECT id, name, namespace, version, wasm_cid, source_cid, ws_persistent, ws_idle_timeout_sec, ws_max_frame_bytes, ws_max_inflight_per_conn, raw_http_response, memory_limit_mb, timeout_seconds, is_public, retry_count, 
retry_delay_seconds, dlq_topic, status, created_at, updated_at, created_by @@ -207,6 +210,7 @@ func (s *FunctionStore) List(ctx context.Context, namespace string) ([]*Function query := ` SELECT f.id, f.name, f.namespace, f.version, f.wasm_cid, f.source_cid, f.ws_persistent, f.ws_idle_timeout_sec, f.ws_max_frame_bytes, f.ws_max_inflight_per_conn, + f.raw_http_response, f.memory_limit_mb, f.timeout_seconds, f.is_public, f.retry_count, f.retry_delay_seconds, f.dlq_topic, f.status, f.created_at, f.updated_at, f.created_by @@ -238,7 +242,7 @@ func (s *FunctionStore) List(ctx context.Context, namespace string) ([]*Function // ListVersions returns all versions of a function. func (s *FunctionStore) ListVersions(ctx context.Context, namespace, name string) ([]*Function, error) { query := ` - SELECT id, name, namespace, version, wasm_cid, source_cid, ws_persistent, ws_idle_timeout_sec, ws_max_frame_bytes, ws_max_inflight_per_conn, + SELECT id, name, namespace, version, wasm_cid, source_cid, ws_persistent, ws_idle_timeout_sec, ws_max_frame_bytes, ws_max_inflight_per_conn, raw_http_response, memory_limit_mb, timeout_seconds, is_public, retry_count, retry_delay_seconds, dlq_topic, status, created_at, updated_at, created_by @@ -399,5 +403,6 @@ func rowToFunction(row *functionRow) *Function { WSIdleTimeoutSec: row.WSIdleTimeoutSec, WSMaxFrameBytes: row.WSMaxFrameBytes, WSMaxInflightPerConn: row.WSMaxInflightPerConn, + RawHTTPResponse: row.RawHTTPResponse, } } diff --git a/core/pkg/serverless/registry/types.go b/core/pkg/serverless/registry/types.go index 0100211..383f366 100644 --- a/core/pkg/serverless/registry/types.go +++ b/core/pkg/serverless/registry/types.go @@ -38,6 +38,9 @@ type FunctionDefinition struct { WSIdleTimeoutSec int WSMaxFrameBytes int WSMaxInflightPerConn int + + // RawHTTPResponse enables raw-HTTP-response mode (bugboard #835). + RawHTTPResponse bool } // Function represents a deployed serverless function. 
@@ -64,6 +67,9 @@ type Function struct { WSIdleTimeoutSec int WSMaxFrameBytes int WSMaxInflightPerConn int + + // RawHTTPResponse enables raw-HTTP-response mode (bugboard #835). + RawHTTPResponse bool } // LogEntry represents a log message emitted from inside a WASM function @@ -180,6 +186,7 @@ type functionRow struct { WSIdleTimeoutSec int WSMaxFrameBytes int WSMaxInflightPerConn int + RawHTTPResponse bool } type envVarRow struct { diff --git a/core/pkg/serverless/registry_raw_http_test.go b/core/pkg/serverless/registry_raw_http_test.go new file mode 100644 index 0000000..c544a73 --- /dev/null +++ b/core/pkg/serverless/registry_raw_http_test.go @@ -0,0 +1,34 @@ +package serverless + +import ( + "strings" + "testing" +) + +// TestRegistryRowMapping_IncludesRawHTTPResponse guards the raw-HTTP-response +// column (bugboard #835): rowToFunction must copy raw_http_response off the DB +// row, otherwise the engine's `if fn.RawHTTPResponse` branch never attaches a +// collector and set_http_response is a permanent no-op for every function. +func TestRegistryRowMapping_IncludesRawHTTPResponse(t *testing.T) { + row := functionRow{RawHTTPResponse: true} + r := &Registry{} + fn := r.rowToFunction(&row) + if !fn.RawHTTPResponse { + t.Error("rowToFunction did not propagate RawHTTPResponse — raw-HTTP functions would silently fall back to JSON/Ack output (bugboard #835)") + } +} + +// TestRegistry_QueriesRawHTTPResponseColumn is the SQL-text drift guard: the +// raw_http_response column must appear in the INSERT plus every READ-path +// SELECT, mirroring the ws_* column guard. Counted ≥5 (one INSERT + the +// Get/GetByID/List/ListVersions/getByNameInternal SELECTs). 
+func TestRegistry_QueriesRawHTTPResponseColumn(t *testing.T) { + source, err := readRegistrySource() + if err != nil { + t.Skipf("cannot read registry.go for SQL inspection: %v", err) + } + count := strings.Count(source, "raw_http_response") + if count < 5 { + t.Errorf("column raw_http_response appears in registry.go only %d times; expected ≥5 (INSERT + each SELECT path). A READ path probably dropped it and raw-HTTP functions will silently fall back to JSON output.", count) + } +} diff --git a/core/pkg/serverless/triggers/dispatch_local_dedup_integration_test.go b/core/pkg/serverless/triggers/dispatch_local_dedup_integration_test.go new file mode 100644 index 0000000..0606a51 --- /dev/null +++ b/core/pkg/serverless/triggers/dispatch_local_dedup_integration_test.go @@ -0,0 +1,159 @@ +package triggers + +import ( + "context" + "fmt" + "testing" + + olriclib "github.com/olric-data/olric" + "github.com/olric-data/olric/stats" + "go.uber.org/zap" + "go.uber.org/zap/zapcore" + "go.uber.org/zap/zaptest/observer" +) + +// failingOlricClient is a minimal olric.Client whose NewDMap always errors, +// simulating an Olric backend that is configured but unavailable — the +// degraded path bugboard #555 must surface (fail-open + rate-limited WARN). 
+type failingOlricClient struct{} + +func (failingOlricClient) NewDMap(string, ...olriclib.DMapOption) (olriclib.DMap, error) { + return nil, fmt.Errorf("olric unavailable (test)") +} +func (failingOlricClient) NewPubSub(...olriclib.PubSubOption) (*olriclib.PubSub, error) { + return nil, fmt.Errorf("not implemented") +} +func (failingOlricClient) Stats(context.Context, string, ...olriclib.StatsOption) (stats.Stats, error) { + return stats.Stats{}, fmt.Errorf("not implemented") +} +func (failingOlricClient) Ping(context.Context, string, string) (string, error) { + return "", fmt.Errorf("not implemented") +} +func (failingOlricClient) RoutingTable(context.Context) (olriclib.RoutingTable, error) { + return nil, fmt.Errorf("not implemented") +} +func (failingOlricClient) Members(context.Context) ([]olriclib.Member, error) { + return nil, fmt.Errorf("not implemented") +} +func (failingOlricClient) RefreshMetadata(context.Context) error { return nil } +func (failingOlricClient) Close(context.Context) error { return nil } + +var _ olriclib.Client = failingOlricClient{} + +// Bugboard #555 — duplicate push from the dispatcher firing twice. +// +// These exercise Dispatch's local-dedup short-circuit and the +// degraded-dedup WARN. They use a nil-db store: getMatches would panic on +// the nil rqlite.Client, so "did we reach getMatches?" is observable as +// "did Dispatch panic?". The local dedup runs BEFORE getMatches, so a +// deduped call must return cleanly without touching the store. + +func TestDispatch_localDedupSkipsSecondInvokeSameNode(t *testing.T) { + logger := zap.NewNop() + store := NewPubSubTriggerStore(nil, logger) // nil db: getMatches panics if reached + d := NewPubSubDispatcher(store, nil, nil, nil, logger) + + ns, topic, data := "anchat", "messages:new", []byte(`{"messageId":"m1"}`) + + // First publish: NOT deduped → reaches getMatches → nil-db panic. We + // recover and confirm we got past the dedup gate. 
+ reachedStore := false + func() { + defer func() { + if recover() != nil { + reachedStore = true + } + }() + d.Dispatch(context.Background(), ns, topic, data, 0) + }() + if !reachedStore { + t.Fatal("first publish must pass the dedup gate and reach the store lookup") + } + + // Second IDENTICAL publish within the TTL: MUST be deduped locally and + // return BEFORE getMatches — so no panic this time. + dedupedClean := true + func() { + defer func() { + if recover() != nil { + dedupedClean = false + } + }() + d.Dispatch(context.Background(), ns, topic, data, 0) + }() + if !dedupedClean { + t.Error("BUG #555 REGRESSION: identical second publish on the same node " + + "must be deduped locally and NOT re-dispatch") + } +} + +func TestDispatch_distinctPayloadsBothDispatch(t *testing.T) { + logger := zap.NewNop() + store := NewPubSubTriggerStore(nil, logger) + d := NewPubSubDispatcher(store, nil, nil, nil, logger) + + ns, topic := "anchat", "messages:new" + + for _, data := range [][]byte{[]byte(`{"messageId":"a"}`), []byte(`{"messageId":"b"}`)} { + reachedStore := false + func() { + defer func() { + if recover() != nil { + reachedStore = true + } + }() + d.Dispatch(context.Background(), ns, topic, data, 0) + }() + if !reachedStore { + t.Errorf("distinct payload %q must NOT be deduped — it must reach dispatch", data) + } + } +} + +func TestClaimDispatch_degradedWarnWhenOlricDown(t *testing.T) { + // Olric "configured but failing" path: a non-nil client whose NewDMap + // errors. claimDispatch must STILL fire (fail-open) AND emit a WARN so + // operators can see cross-node dedup is degraded. 
+ core, observed := observer.New(zapcore.WarnLevel) + d := &PubSubDispatcher{ + logger: zap.New(core), + olricClient: failingOlricClient{}, + } + + if !d.claimDispatch(context.Background(), "ns", "messages:new", []byte("x")) { + t.Fatal("claimDispatch must fail-open (true) when Olric is degraded — never drop the wake") + } + if observed.FilterMessageSnippet("dedup degraded").Len() == 0 { + t.Error("degraded Olric path must emit a WARN naming the degradation, not stay silent") + } +} + +func TestClaimDispatch_degradedWarnRateLimited(t *testing.T) { + // A sustained outage must NOT flood the log: only one WARN per interval. + core, observed := observer.New(zapcore.WarnLevel) + d := &PubSubDispatcher{ + logger: zap.New(core), + olricClient: failingOlricClient{}, + } + + for i := 0; i < 5; i++ { + d.claimDispatch(context.Background(), "ns", "messages:new", []byte("x")) + } + if got := observed.FilterMessageSnippet("dedup degraded").Len(); got != 1 { + t.Errorf("degraded WARN must be rate-limited to 1 per interval; got %d", got) + } +} + +func TestClaimDispatch_nilOlricStaysQuiet(t *testing.T) { + // nil Olric is a NORMAL single-node / cache-disabled config, not a + // degraded multi-node cluster. It must fire but NOT warn (avoid noise). + core, observed := observer.New(zapcore.WarnLevel) + d := &PubSubDispatcher{logger: zap.New(core)} // olricClient nil + + if !d.claimDispatch(context.Background(), "ns", "messages:new", []byte("x")) { + t.Fatal("nil Olric must fail-open (true)") + } + if observed.Len() != 0 { + t.Errorf("nil Olric is a normal config and must NOT emit a degraded WARN; got %d logs", observed.Len()) + } +} diff --git a/core/pkg/serverless/triggers/dispatcher.go b/core/pkg/serverless/triggers/dispatcher.go index dcff17f..91f7ef6 100644 --- a/core/pkg/serverless/triggers/dispatcher.go +++ b/core/pkg/serverless/triggers/dispatcher.go @@ -134,8 +134,24 @@ type PubSubDispatcher struct { // stopCh signals the periodic Refresh goroutine to exit. 
stopCh chan struct{} stopOnce sync.Once + + // localDedup guards against a SINGLE node invoking the same publish + // twice (e.g. gossipsub self-delivery), independent of Olric health. + // Bugboard #555. Always non-nil after NewPubSubDispatcher. + localDedup *localDedupCache + + // degradedDedupWarn rate-limits the "Olric dedup degraded" WARN so a + // misconfigured cluster doesn't flood the log on every publish. + // Bugboard #555. + degradedDedupMu sync.Mutex + degradedDedupLastWarn time.Time } +// degradedDedupWarnInterval rate-limits the cross-node dedup-degraded WARN +// (bugboard #555). One warning per interval is enough to alert operators +// without flooding the log under high publish volume. +const degradedDedupWarnInterval = 60 * time.Second + // NewPubSubDispatcher creates a new PubSub trigger dispatcher. // // The `ps` argument may be nil (e.g. in tests, or namespaces with pubsub @@ -158,6 +174,7 @@ func NewPubSubDispatcher( logger: logger, subscribedKeys: make(map[string]bool), stopCh: make(chan struct{}), + localDedup: newLocalDedupCache(), } } @@ -337,6 +354,20 @@ func (d *PubSubDispatcher) Dispatch(ctx context.Context, namespace, topic string return } + // Local once-per-publish dedup (bugboard #555). gossipsub can deliver + // the SAME publish to this node's subscribe handler more than once + // (self-delivery / fan-out), and the cross-node Olric claim below is a + // no-op when Olric is down. This in-process guard ensures a SINGLE node + // never invokes the same (namespace, topic, payload) twice, regardless + // of Olric health. + dedupKey := dispatchDedupKey(namespace, topic, data) + if !d.localDedup.claim(dedupKey) { + d.logger.Debug("PubSub dispatch deduped (local duplicate on this node)", + zap.String("namespace", namespace), + zap.String("topic", topic)) + return + } + // Cluster-wide once-per-publish dedup (bugboard #30). 
gossipsub // delivers a publish to every subscribed gateway node; only the node // that wins the Olric claim for this (namespace, topic, payload) @@ -580,7 +611,7 @@ func (d *PubSubDispatcher) claimDispatch(ctx context.Context, namespace, topic s } dm, err := d.olricClient.NewDMap(dispatchDedupDMap) if err != nil { - d.logger.Debug("dispatch dedup: NewDMap failed, firing (fail-open)", zap.Error(err)) + d.warnDedupDegraded("NewDMap failed", namespace, topic, err) return true } key := dispatchDedupKey(namespace, topic, data) @@ -594,11 +625,39 @@ func (d *PubSubDispatcher) claimDispatch(ctx context.Context, namespace, topic s // Any other (transient) error: fail-open and fire rather than risk a // dropped wake. Worst case is a duplicate, which is what #30 already // had — never worse. - d.logger.Debug("dispatch dedup: claim errored, firing (fail-open)", - zap.String("topic", topic), zap.Error(err)) + d.warnDedupDegraded("claim Put errored", namespace, topic, err) return true } +// warnDedupDegraded emits a rate-limited WARN announcing that cross-node +// dispatch dedup is degraded (Olric unavailable), so the cluster has fallen +// back to firing on every node that receives the publish. The local cache +// still prevents same-node duplicates, but cross-node duplicate pushes are +// possible until Olric recovers — operators need visibility, not silence +// (bugboard #555). Rate-limited so a sustained outage doesn't flood logs. 
+func (d *PubSubDispatcher) warnDedupDegraded(reason, namespace, topic string, err error) { + d.degradedDedupMu.Lock() + now := time.Now() + shouldWarn := now.Sub(d.degradedDedupLastWarn) >= degradedDedupWarnInterval + if shouldWarn { + d.degradedDedupLastWarn = now + } + d.degradedDedupMu.Unlock() + + if !shouldWarn { + return + } + d.logger.Warn("PubSub dispatch dedup degraded: Olric unavailable, "+ + "falling back to fire-on-every-node — cross-node duplicate pushes "+ + "possible until the shared store recovers", + zap.String("reason", reason), + zap.String("namespace", namespace), + zap.String("topic", topic), + zap.Duration("warn_interval", degradedDedupWarnInterval), + zap.Error(err), + ) +} + // InvalidateCache is now a no-op — the dispatcher no longer caches lookups. // Kept on the type so callers who used it still compile. func (d *PubSubDispatcher) InvalidateCache(ctx context.Context, namespace, topic string) {} diff --git a/core/pkg/serverless/triggers/local_dedup.go b/core/pkg/serverless/triggers/local_dedup.go new file mode 100644 index 0000000..c6c48e4 --- /dev/null +++ b/core/pkg/serverless/triggers/local_dedup.go @@ -0,0 +1,108 @@ +package triggers + +import ( + "sync" + "time" +) + +// Bugboard #555 — messages:new trigger fires twice (duplicate push). +// +// Two distinct bugs produced duplicate dispatches: +// +// 1. Cross-node fail-open: claimDispatch (dispatcher.go) coordinates +// once-per-publish dispatch via Olric, but FAILS OPEN when Olric is +// unavailable/misconfigured. On a multi-node cluster every node that +// receives the gossip publish then fires the handler → N duplicate +// invocations (AnChat: exactly 2 on a 2-reachable-node cluster). +// +// 2. Single-node self-delivery: even on one node, gossipsub can deliver a +// locally-originated publish back to the same node's subscribe handler, +// and the only guard was the cross-node Olric claim — which is a no-op +// when Olric is down. 
+// +// localDedupCache fixes (2) and bounds the blast radius of (1): a single +// node never invokes the SAME publish twice, regardless of Olric health. +// It is a small bounded map with per-entry TTL, keyed by the SAME string +// dispatchDedupKey produces — (namespace, topic, sha256(payload)[:16]). +// +// IDENTICAL-PAYLOAD CAVEAT: the key folds the payload hash, NOT a stable +// message id (gossipsub's message-ID isn't plumbed through the subscribe +// handler, and parsing an app-specific id would couple the dispatcher to a +// tenant's JSON schema). So two byte-identical publishes within the TTL +// window collapse to one local invocation. Real payloads carry a unique id +// (messageId/seq), so this is not a practical concern; it is the same +// trade-off documented on dispatchDedupKey. +const ( + // localDedupTTL bounds how long a (namespace, topic, payload) claim is + // remembered on this node. It must cover gossipsub self-delivery / + // fan-out jitter without de-duplicating legitimately-repeated publishes + // seconds apart. Kept in lockstep with dispatchDedupTTL. + localDedupTTL = 30 * time.Second + + // localDedupMaxEntries caps the cache so a high-throughput namespace + // can't grow it without bound. When the cap is hit, expired entries are + // swept first; if still full, the claim is allowed through (fail-open — + // a rare duplicate is far better than dropping a wake). + localDedupMaxEntries = 4096 +) + +// localDedupCache is a bounded, TTL'd set of recently-dispatched keys for a +// single node. Safe for concurrent use. +type localDedupCache struct { + mu sync.Mutex + entries map[string]time.Time // key -> expiry + ttl time.Duration + maxSize int + now func() time.Time // injectable clock for tests +} + +// newLocalDedupCache builds a cache with the package default TTL and size. 
+func newLocalDedupCache() *localDedupCache { + return &localDedupCache{ + entries: make(map[string]time.Time), + ttl: localDedupTTL, + maxSize: localDedupMaxEntries, + now: time.Now, + } +} + +// claim records the key and reports whether THIS node may dispatch it now. +// +// Returns true the first time a key is seen within the TTL window (caller +// should dispatch) and false on subsequent calls within the window (caller +// should skip — it's a local duplicate). +// +// Fail-open: if the cache is at capacity and can't be swept enough to make +// room, claim returns true (allow dispatch) rather than risk dropping a +// legitimate wake. +func (c *localDedupCache) claim(key string) bool { + c.mu.Lock() + defer c.mu.Unlock() + + now := c.now() + if exp, ok := c.entries[key]; ok && now.Before(exp) { + return false // seen recently → local duplicate → skip + } + + // Either unseen or the previous entry expired. Sweep expired entries + // before inserting so the map doesn't accumulate dead keys. + if len(c.entries) >= c.maxSize { + c.sweepExpiredLocked(now) + } + if len(c.entries) >= c.maxSize { + // Still full of live entries — allow dispatch rather than drop. + return true + } + + c.entries[key] = now.Add(c.ttl) + return true +} + +// sweepExpiredLocked removes expired entries. Caller must hold c.mu. +func (c *localDedupCache) sweepExpiredLocked(now time.Time) { + for k, exp := range c.entries { + if !now.Before(exp) { + delete(c.entries, k) + } + } +} diff --git a/core/pkg/serverless/triggers/local_dedup_test.go b/core/pkg/serverless/triggers/local_dedup_test.go new file mode 100644 index 0000000..80f80f6 --- /dev/null +++ b/core/pkg/serverless/triggers/local_dedup_test.go @@ -0,0 +1,140 @@ +package triggers + +import ( + "sync" + "sync/atomic" + "testing" + "time" +) + +// Bugboard #555 — a SINGLE node must never invoke the same publish twice, +// independent of Olric health. These tests pin the local dedup cache's +// claim/expiry/eviction behavior. 
+ +func TestLocalDedupCache_sameKeyClaimedOncePerWindow(t *testing.T) { + c := newLocalDedupCache() + key := dispatchDedupKey("ns", "messages:new", []byte(`{"id":1}`)) + + if !c.claim(key) { + t.Fatal("first claim of an unseen key must fire (return true)") + } + if c.claim(key) { + t.Error("second claim within the TTL must be deduped (return false)") + } +} + +func TestLocalDedupCache_distinctKeysBothFire(t *testing.T) { + c := newLocalDedupCache() + a := dispatchDedupKey("ns", "messages:new", []byte("A")) + b := dispatchDedupKey("ns", "messages:new", []byte("B")) + + if !c.claim(a) { + t.Error("distinct payload A must fire") + } + if !c.claim(b) { + t.Error("distinct payload B must fire (different payload → different key)") + } +} + +func TestLocalDedupCache_expiredEntryFiresAgain(t *testing.T) { + // Drive a controllable clock so we don't sleep in the test. + cur := time.Unix(1_000_000, 0) + c := newLocalDedupCache() + c.now = func() time.Time { return cur } + + key := dispatchDedupKey("ns", "messages:new", []byte("x")) + if !c.claim(key) { + t.Fatal("first claim must fire") + } + if c.claim(key) { + t.Fatal("immediate re-claim must be deduped") + } + + // Advance past the TTL: the entry has expired, so the same key must + // fire again (a legitimately-repeated publish seconds apart). + cur = cur.Add(localDedupTTL + time.Second) + if !c.claim(key) { + t.Error("after TTL expiry the same key must fire again") + } +} + +func TestLocalDedupCache_evictsExpiredOnPressure(t *testing.T) { + cur := time.Unix(2_000_000, 0) + c := &localDedupCache{ + entries: make(map[string]time.Time), + ttl: localDedupTTL, + maxSize: 4, // tiny cap to exercise the sweep path deterministically + now: func() time.Time { return cur }, + } + + // Fill to capacity with soon-to-expire entries. 
+ for i := 0; i < c.maxSize; i++ { + key := dispatchDedupKey("ns", "t", []byte{byte(i)}) + if !c.claim(key) { + t.Fatalf("fill claim %d must fire", i) + } + } + if len(c.entries) != c.maxSize { + t.Fatalf("expected cache full at %d, got %d", c.maxSize, len(c.entries)) + } + + // Advance past TTL so every existing entry is expired, then claim a new + // key: the sweep must reclaim space and the new key must be recorded. + cur = cur.Add(localDedupTTL + time.Second) + newKey := dispatchDedupKey("ns", "t", []byte("fresh")) + if !c.claim(newKey) { + t.Fatal("new key under pressure must fire") + } + if _, ok := c.entries[newKey]; !ok { + t.Error("new key must be recorded after expired entries were swept") + } + if len(c.entries) > c.maxSize { + t.Errorf("cache must not exceed maxSize after sweep; got %d", len(c.entries)) + } +} + +func TestLocalDedupCache_concurrentClaimsExactlyOneWins(t *testing.T) { + // Race condition guard: when many goroutines race to claim the SAME key + // (gossipsub delivering one publish across handler goroutines), exactly + // one must win. Run under -race to catch unsynchronized map access. + c := newLocalDedupCache() + key := dispatchDedupKey("ns", "messages:new", []byte(`{"id":"race"}`)) + + const goroutines = 64 + var wins int64 + var wg sync.WaitGroup + wg.Add(goroutines) + for i := 0; i < goroutines; i++ { + go func() { + defer wg.Done() + if c.claim(key) { + atomic.AddInt64(&wins, 1) + } + }() + } + wg.Wait() + + if wins != 1 { + t.Errorf("exactly one concurrent claim of the same key must win; got %d", wins) + } +} + +func TestLocalDedupCache_failsOpenWhenFullOfLiveEntries(t *testing.T) { + cur := time.Unix(3_000_000, 0) + c := &localDedupCache{ + entries: make(map[string]time.Time), + ttl: localDedupTTL, + maxSize: 2, + now: func() time.Time { return cur }, + } + + // Fill with two still-live entries. 
+ c.claim(dispatchDedupKey("ns", "t", []byte("a"))) + c.claim(dispatchDedupKey("ns", "t", []byte("b"))) + + // A new key when the cache is full of LIVE entries must fail-open + // (fire) rather than drop a legitimate wake. + if !c.claim(dispatchDedupKey("ns", "t", []byte("c"))) { + t.Error("claim must fail-open (true) when the cache is full of live entries") + } +} diff --git a/core/pkg/serverless/types.go b/core/pkg/serverless/types.go index ec384b9..19c9455 100644 --- a/core/pkg/serverless/types.go +++ b/core/pkg/serverless/types.go @@ -237,6 +237,11 @@ type FunctionDefinition struct { WSIdleTimeoutSec int `json:"ws_idle_timeout_sec,omitempty"` // 0 = no idle timeout WSMaxFrameBytes int `json:"ws_max_frame_bytes,omitempty"` // 0 = use default 256 KB WSMaxInflightPerConn int `json:"ws_max_inflight_per_conn,omitempty"` // 0 = use default 64 + + // RawHTTPResponse enables raw-HTTP-response mode (bugboard #835): the + // function may call set_http_response to emit a verbatim status/headers/ + // body instead of the JSON/Ack-wrapped output. See pkg/serverless/raw_http.go. + RawHTTPResponse bool `json:"raw_http_response,omitempty"` } // DBTriggerConfig defines a database trigger configuration. @@ -270,6 +275,11 @@ type Function struct { WSIdleTimeoutSec int `json:"ws_idle_timeout_sec,omitempty"` WSMaxFrameBytes int `json:"ws_max_frame_bytes,omitempty"` WSMaxInflightPerConn int `json:"ws_max_inflight_per_conn,omitempty"` + + // RawHTTPResponse — bugboard #835. When true, the function may emit a + // verbatim HTTP response via set_http_response instead of the + // JSON/Ack-wrapped output. See pkg/serverless/raw_http.go. + RawHTTPResponse bool `json:"raw_http_response,omitempty"` } // InvocationContext provides context for a function invocation. @@ -308,6 +318,14 @@ type InvocationContext struct { // could create by publishing topics that match its own wildcard // trigger (bugboard #93 follow-up). 
TriggerDepth int `json:"trigger_depth,omitempty"` + + // RawHTTP carries a verbatim HTTP response set by a RawHTTPResponse + // function (bugboard #835). The engine populates this from the + // per-invocation collector after Execute returns; the Invoker surfaces + // it on InvokeResponse so the HTTP handler can replay it. nil/unset for + // normal functions and functions that didn't call set_http_response. + // Not serialized — internal plumbing only. + RawHTTP *RawHTTPResult `json:"-"` } // InvocationResult represents the result of a function invocation. @@ -555,6 +573,28 @@ type HostServices interface { // in OnClose unless they want to dynamically unsubscribe. WSPubSubUnbridge(ctx context.Context, clientID, topic string) error + // SetHTTPResponse records a verbatim HTTP response (status, headers, body) + // for a RawHTTPResponse function (bugboard #835). The HTTP invoke handler + // replays it byte-for-byte instead of the JSON/Ack-wrapped output, so a + // function can transparently proxy an upstream RPC. Returns an error when + // the function is NOT deployed with raw_http_response, or when the status / + // header count / body size fail validation. headers may be nil. + SetHTTPResponse(ctx context.Context, status int, headers map[string]string, body []byte) error + + // EphemeralStateSet records WS-subscribe-tracked ephemeral state owned by + // the current invocation's WS client (bugboard #710) and publishes a "set" + // event on the topic so subscribers observe it. The state auto-clears (with + // a synthetic "clear" event) when the owning WS client disconnects, and + // also expires after ttlMs (clamped to a max; <=0 uses a default). Returns + // an error when there is no WS client in context, on empty topic/key, on an + // oversized payload, or when the client's per-connection key cap is hit. 
+ EphemeralStateSet(ctx context.Context, topic, key string, payload []byte, ttlMs int64) error + + // EphemeralStateClear removes ephemeral state the current WS client owns + // and publishes a "clear" event. Idempotent: clearing a missing or + // non-owned key is a no-op. Errors only on no-WS-client / empty topic-key. + EphemeralStateClear(ctx context.Context, topic, key string) error + // WebSocket operations (only valid in WS context) WSSend(ctx context.Context, clientID string, data []byte) error WSBroadcast(ctx context.Context, topic string, data []byte) error diff --git a/core/pkg/serverless/websocket.go b/core/pkg/serverless/websocket.go index 4bfcd5f..7428894 100644 --- a/core/pkg/serverless/websocket.go +++ b/core/pkg/serverless/websocket.go @@ -23,6 +23,14 @@ type WSManager struct { subscriptions map[string]map[string]struct{} subscriptionsMu sync.RWMutex + // disconnectHooks run (synchronously) on Unregister for each client, + // AFTER the connection + subscriptions are torn down. Used by the + // ephemeral-state store (bugboard #710) to auto-clear a client's owned + // state on disconnect. Both the stateless and persistent WS handlers + // call Unregister, so a single hook covers both paths. + disconnectHooks []func(clientID string) + disconnectHooksMu sync.RWMutex + logger *zap.Logger } @@ -102,6 +110,20 @@ func (m *WSManager) Register(clientID string, conn WebSocketConn) { ) } +// AddDisconnectHook registers a callback fired (synchronously) for every +// client passed to Unregister, after its connection + subscriptions are torn +// down. Used to auto-clear WS-subscribe-tracked ephemeral state on disconnect +// (bugboard #710). Hooks must be cheap and non-blocking — they run inline on +// the WS read loop's teardown path. Register once at gateway init. 
+func (m *WSManager) AddDisconnectHook(hook func(clientID string)) { + if hook == nil { + return + } + m.disconnectHooksMu.Lock() + m.disconnectHooks = append(m.disconnectHooks, hook) + m.disconnectHooksMu.Unlock() +} + // Unregister removes a WebSocket connection and its subscriptions. func (m *WSManager) Unregister(clientID string) { m.connectionsMu.Lock() @@ -130,6 +152,14 @@ func (m *WSManager) Unregister(clientID string) { // Close the connection _ = conn.conn.Close() + // Fire disconnect hooks (ephemeral-state auto-clear, bugboard #710). The hook slice is snapshotted under the read lock and the hooks are invoked after the lock is released, in registration order, so no hook runs while disconnectHooksMu is held. + m.disconnectHooksMu.RLock() + hooks := m.disconnectHooks + m.disconnectHooksMu.RUnlock() + for _, hook := range hooks { + hook(clientID) + } + m.logger.Debug("Unregistered WebSocket connection", zap.String("client_id", clientID), zap.Int("remaining_connections", m.GetConnectionCount()),