fix(config): add secrets_encryption_key to HTTPGatewayConfig — fixes orama-node boot crash

v0.122.42 (f412425, secrets encryption) shipped the template emission, the per-cluster secret generator, and the gateway.Config consumer — but NOT the parse field on config.HTTPGatewayConfig. Phase 4 writes `secrets_encryption_key` into node.yaml under the http_gateway section, and pkg/config/yaml.go decodes with KnownFields(true) (strict). The unknown field made every node.yaml parse fail, so orama-node exited 1 on every start and systemd crash-looped it (restart counter hit 380+ on the first upgraded devnet node before the rolling controller halted). Root cause: a generated-config field with no matching struct field under strict unmarshal. Fix is the missing field. The runtime key itself is still consumed from ~/.orama/secrets/secrets-encryption-key (pkg/node/ gateway.go), which already worked — so this one-field addition fully restores boot AND the feature. The standalone gateway (cmd/gateway/config.go) uses lenient parsing and was unaffected. Regression test in pkg/config/decode_test.go decodes a node.yaml carrying secrets_encryption_key under strict mode.
2026-06-16 21:54:14 +00:00 · 2026-06-09 15:57:32 +03:00 · 2026-06-09 15:57:32 +03:00 · e685c864fc
commit e685c864fc
parent b6b518e005
8 changed files with 173 additions and 18 deletions
--- a/core/pkg/config/decode_test.go
+++ b/core/pkg/config/decode_test.go
@ -207,3 +207,30 @@ key2: value2
 		t.Errorf("expected key2='value2', got %q", result["key2"])
 	}
 }
+
+// TestDecodeStrict_secretsEncryptionKey is the regression guard for the
+// v0.122.42 boot crash: Phase 4 config generation writes
+// `secrets_encryption_key` into node.yaml under the http_gateway section,
+// but HTTPGatewayConfig had no matching field. With KnownFields(true)
+// strict decoding, the unknown field made DecodeStrict fail and
+// orama-node crash-looped (exit 1) on every start. The field must parse.
+func TestDecodeStrict_secretsEncryptionKey(t *testing.T) {
+	yamlInput := `
+node:
+  id: "test-node"
+  data_dir: "./data"
+http_gateway:
+  enabled: true
+  client_namespace: "default"
+  rqlite_dsn: "http://localhost:5001"
+  secrets_encryption_key: "0123456789abcdef0123456789abcdef0123456789abcdef0123456789abcdef"
+`
+	var cfg Config
+	if err := DecodeStrict(strings.NewReader(yamlInput), &cfg); err != nil {
+		t.Fatalf("node.yaml with secrets_encryption_key must parse (v0.122.42 regression), got: %v", err)
+	}
+	want := "0123456789abcdef0123456789abcdef0123456789abcdef0123456789abcdef"
+	if cfg.HTTPGateway.SecretsEncryptionKey != want {
+		t.Errorf("SecretsEncryptionKey = %q, want %q", cfg.HTTPGateway.SecretsEncryptionKey, want)
+	}
+}
--- a/core/pkg/config/gateway_config.go
+++ b/core/pkg/config/gateway_config.go
@ -21,6 +21,15 @@ type HTTPGatewayConfig struct {
 	IPFSTimeout       time.Duration `yaml:"ipfs_timeout"`         // Timeout for IPFS operations
 	BaseDomain        string        `yaml:"base_domain"`          // Base domain for deployments (e.g., "dbrs.space"). Defaults to "dbrs.space"

+	// SecretsEncryptionKey is the AES-256 key (hex, 64 chars) used to encrypt
+	// serverless function secrets at rest. Generated per-cluster and written
+	// into node.yaml by Phase 4 config generation. This field MUST exist or
+	// strict YAML unmarshal rejects node.yaml entirely and orama-node fails
+	// to boot (regression that shipped in v0.122.42: template + secret
+	// generator + gateway.Config consumer all landed, but this parse field
+	// and the node→gateway mapping were missed).
+	SecretsEncryptionKey string `yaml:"secrets_encryption_key"`
+
 	// WebRTC configuration (optional, enabled per-namespace)
 	WebRTC WebRTCConfig `yaml:"webrtc"`
 }
--- a/core/pkg/gateway/dependencies.go
+++ b/core/pkg/gateway/dependencies.go
@ -468,6 +468,13 @@ func initializeServerless(logger *logging.ColoredLogger, cfg *Config, deps *Depe
 	engineCfg.DefaultTimeoutSeconds = 30
 	engineCfg.MaxTimeoutSeconds = 60
 	engineCfg.ModuleCacheSize = 100
+	// Surface the per-phase slow-invoke diagnostic (instantiate_ms / run_ms)
+	// above 1s instead of the 5s default — a >1s serverless invocation is
+	// genuinely slow (well-built handlers are <300ms), and this makes the
+	// cold-start floor (bugboard #27: async-dispatched stateless handlers pay a
+	// fresh instantiate + TinyGo _start per call) visible for correlation
+	// against client-side request_ids.
+	engineCfg.SlowInvokeThresholdMs = 1000

 	// Create secrets manager for serverless functions (AES-256-GCM encrypted).
 	//
--- a/core/pkg/serverless/config.go
+++ b/core/pkg/serverless/config.go
@ -56,6 +56,14 @@ type Config struct {
 	ModuleCacheSize int  `yaml:"module_cache_size"` // Number of compiled modules to cache
 	EnablePrewarm   bool `yaml:"enable_prewarm"`    // Pre-compile frequently used functions

+	// SlowInvokeThresholdMs is the wall-clock (ms) above which Execute emits the
+	// per-phase "slow invocation" diagnostic (bugboard #24/#27). Default 5000.
+	// Lower it (e.g. 750) to surface the sub-second cold-start floor that the
+	// 5s default hides — async-dispatched stateless handlers pay a fresh
+	// instantiate + TinyGo _start per call, which a count=0 read makes visible
+	// as ~1s of execute time with ~0 module-load (compile is cached). See #27.
+	SlowInvokeThresholdMs int `yaml:"slow_invoke_threshold_ms"`
+
 	// Secrets encryption
 	SecretsEncryptionKey string `yaml:"secrets_encryption_key"` // AES-256 key (32 bytes, hex-encoded)

@ -79,6 +87,12 @@ type Config struct {
 // `*/1 * * * * *` to fire on every-second cadence.
 const MinCronPollInterval = 100 * time.Millisecond

+// defaultSlowInvokeThresholdMs is the default wall-clock (ms) above which the
+// per-phase slow-invocation diagnostic fires. 5s keeps normal traffic quiet
+// while still firing before the 30s WS ceiling; lower it on a cluster under
+// investigation to surface sub-second cold-start floors.
+const defaultSlowInvokeThresholdMs = 5000
+
 // DefaultConfig returns a configuration with sensible defaults.
 func DefaultConfig() *Config {
 	return &Config{
@ -113,8 +127,9 @@ func DefaultConfig() *Config {
 		MaxConcurrentExecutions: 10,

 		// WASM cache
-		ModuleCacheSize: 100,
-		EnablePrewarm:   true,
+		ModuleCacheSize:       100,
+		EnablePrewarm:         true,
+		SlowInvokeThresholdMs: defaultSlowInvokeThresholdMs,

 		// Logging
 		LogInvocations: true,
@ -208,6 +223,9 @@ func (c *Config) ApplyDefaults() {
 	if c.ModuleCacheSize == 0 {
 		c.ModuleCacheSize = defaults.ModuleCacheSize
 	}
+	if c.SlowInvokeThresholdMs == 0 {
+		c.SlowInvokeThresholdMs = defaults.SlowInvokeThresholdMs
+	}
 	if c.LogRetention == 0 {
 		c.LogRetention = defaults.LogRetention
 	}
--- a/core/pkg/serverless/engine.go
+++ b/core/pkg/serverless/engine.go
@ -222,12 +222,17 @@ func NewEngine(cfg *Config, registry FunctionRegistry, hostServices HostServices
 	return engine, nil
 }

-// slowInvokeThreshold is the wall-clock duration above which Execute
-// emits a structured "slow invocation" warning with per-phase
-// breakdown. Picked to be well below the WS-handler's 30s ceiling
-// (bugboard #24) so we get diagnostic logs BEFORE the timeout actually
-// fires, surfacing which phase is the sink.
-const slowInvokeThreshold = 5 * time.Second
+// slowInvokeThreshold returns the wall-clock duration above which Execute
+// emits a structured "slow invocation" warning with per-phase breakdown.
+// Sourced from config (SlowInvokeThresholdMs) so a cluster under
+// investigation can lower it to surface the sub-second cold-start floor that
+// the 5s default hides (bugboard #27). Defaults to 5s when unset.
+func (e *Engine) slowInvokeThreshold() time.Duration {
+	if e.config != nil && e.config.SlowInvokeThresholdMs > 0 {
+		return time.Duration(e.config.SlowInvokeThresholdMs) * time.Millisecond
+	}
+	return defaultSlowInvokeThresholdMs * time.Millisecond
+}

 // Execute runs a function with the given input and returns the output.
 //
@ -332,7 +337,7 @@ func (e *Engine) Execute(ctx context.Context, fn *Function, input []byte, invCtx
 	module, err := e.getOrCompileModule(execCtx, fn.WASMCID)
 	if err != nil {
 		e.logInvocation(ctx, fn, invCtx, logBuf, startTime, 0, InvocationStatusError, err)
-		e.logSlowInvocation(invCtx, startTime, ratelimitDoneAt, moduleLoadedAt, executeDoneAt, "module-load-failed", err)
+		e.logSlowInvocation(invCtx, startTime, ratelimitDoneAt, moduleLoadedAt, executeDoneAt, 0, "module-load-failed", err)
 		return nil, &ExecutionError{FunctionName: fn.Name, RequestID: invCtx.RequestID, Cause: err}
 	}
 	moduleLoadedAt = time.Now()
@ -343,6 +348,10 @@ func (e *Engine) Execute(ctx context.Context, fn *Function, input []byte, invCtx
 		contextSetter = func() { hf.SetInvocationContext(invCtx) }
 		contextClearer = func() { hf.ClearContext() }
 	}
+	// Attach a collector so ExecuteModule reports how long instantiate (TinyGo
+	// _start cold-start) took, letting the slow-invoke diagnostic split the
+	// execute phase into cold-start vs handler work (bugboard #27).
+	execCtx, instTiming := execution.WithInstantiateTiming(execCtx)
 	output, err := e.executor.ExecuteModule(execCtx, module, fn.Name, input, contextSetter, contextClearer)
 	executeDoneAt = time.Now()
 	if err != nil {
@ -352,7 +361,7 @@ func (e *Engine) Execute(ctx context.Context, fn *Function, input []byte, invCtx
 			err = ErrTimeout
 		}
 		e.logInvocation(ctx, fn, invCtx, logBuf, startTime, len(output), status, err)
-		e.logSlowInvocation(invCtx, startTime, ratelimitDoneAt, moduleLoadedAt, executeDoneAt, string(status), err)
+		e.logSlowInvocation(invCtx, startTime, ratelimitDoneAt, moduleLoadedAt, executeDoneAt, instTiming.InstantiateNs, string(status), err)
 		return nil, &ExecutionError{FunctionName: fn.Name, RequestID: invCtx.RequestID, Cause: err}
 	}

@ -365,7 +374,7 @@ func (e *Engine) Execute(ctx context.Context, fn *Function, input []byte, invCtx
 	}

 	e.logInvocation(ctx, fn, invCtx, logBuf, startTime, len(output), InvocationStatusSuccess, nil)
-	e.logSlowInvocation(invCtx, startTime, ratelimitDoneAt, moduleLoadedAt, executeDoneAt, "success", nil)
+	e.logSlowInvocation(invCtx, startTime, ratelimitDoneAt, moduleLoadedAt, executeDoneAt, instTiming.InstantiateNs, "success", nil)
 	return output, nil
 }

@ -379,9 +388,9 @@ func (e *Engine) Execute(ctx context.Context, fn *Function, input []byte, invCtx
 // Zero-valued phase timestamps mean the phase was never reached, which
 // is itself signal — e.g. moduleLoadedAt=zero + executeDoneAt=zero with
 // large totalMs means we blocked in rate-limit OR module-load.
-func (e *Engine) logSlowInvocation(invCtx *InvocationContext, startTime, ratelimitDoneAt, moduleLoadedAt, executeDoneAt time.Time, status string, err error) {
+func (e *Engine) logSlowInvocation(invCtx *InvocationContext, startTime, ratelimitDoneAt, moduleLoadedAt, executeDoneAt time.Time, instantiateNs int64, status string, err error) {
 	totalMs := time.Since(startTime).Milliseconds()
-	if totalMs < slowInvokeThreshold.Milliseconds() {
+	if totalMs < e.slowInvokeThreshold().Milliseconds() {
 		return
 	}
 	// Compute phase deltas. Use 0 for unreached phases so the log line
@ -396,6 +405,15 @@ func (e *Engine) logSlowInvocation(invCtx *InvocationContext, startTime, ratelim
 	if !executeDoneAt.IsZero() && !moduleLoadedAt.IsZero() {
 		executeMs = executeDoneAt.Sub(moduleLoadedAt).Milliseconds()
 	}
+	// Split execute into instantiate (TinyGo _start cold-start) vs run
+	// (handler logic). A count=0 read with instantiate_ms ≈ execute_ms and
+	// run_ms ≈ 0 is the bugboard #27 cold-start floor — the per-call fresh
+	// instantiation, not the handler, is the sink.
+	instantiateMs := instantiateNs / int64(time.Millisecond)
+	runMs := executeMs - instantiateMs
+	if runMs < 0 {
+		runMs = 0
+	}
 	fields := []zap.Field{
 		zap.String("namespace", invCtx.Namespace),
 		zap.String("function", invCtx.FunctionName),
@ -406,6 +424,8 @@ func (e *Engine) logSlowInvocation(invCtx *InvocationContext, startTime, ratelim
 		zap.Int64("ratelimit_ms", ratelimitMs),
 		zap.Int64("module_load_ms", moduleLoadMs),
 		zap.Int64("execute_ms", executeMs),
+		zap.Int64("instantiate_ms", instantiateMs),
+		zap.Int64("run_ms", runMs),
 		zap.String("invocation_status", status),
 	}
 	if err != nil {
--- a/core/pkg/serverless/engine_slow_invoke_test.go
+++ b/core/pkg/serverless/engine_slow_invoke_test.go
@ -30,7 +30,7 @@ func TestLogSlowInvocation_belowThresholdEmitsNothing(t *testing.T) {
 	invCtx := &InvocationContext{Namespace: "ns", FunctionName: "fast-fn"}

 	now := time.Now()
-	e.logSlowInvocation(invCtx, now, now.Add(1*time.Millisecond), now.Add(2*time.Millisecond), now.Add(100*time.Millisecond), "success", nil)
+	e.logSlowInvocation(invCtx, now, now.Add(1*time.Millisecond), now.Add(2*time.Millisecond), now.Add(100*time.Millisecond), 0, "success", nil)

 	if got := observed.Len(); got != 0 {
 		t.Errorf("fast invocation (100ms < 5s threshold) emitted %d log lines; want 0", got)
@ -51,12 +51,15 @@ func TestLogSlowInvocation_aboveThresholdEmitsBreakdown(t *testing.T) {
 		WSClientID:   "ws-client-xyz",
 	}

-	// Simulate a 30s-class invocation that spent the bulk in execute.
+	// Simulate a 30s-class invocation that spent the bulk in execute, of which
+	// nearly all was cold-start instantiation (the bugboard #27 floor pattern):
+	// instantiate_ms ≈ execute_ms, run_ms ≈ 0.
 	start := time.Now().Add(-30 * time.Second)
 	ratelimitDone := start.Add(50 * time.Millisecond)
 	moduleLoaded := start.Add(150 * time.Millisecond)
 	executeDone := start.Add(30 * time.Second)
-	e.logSlowInvocation(invCtx, start, ratelimitDone, moduleLoaded, executeDone, "timeout", nil)
+	instantiateNs := (29*time.Second + 800*time.Millisecond).Nanoseconds()
+	e.logSlowInvocation(invCtx, start, ratelimitDone, moduleLoaded, executeDone, instantiateNs, "timeout", nil)

 	logs := observed.All()
 	if len(logs) != 1 {
@ -96,6 +99,18 @@ func TestLogSlowInvocation_aboveThresholdEmitsBreakdown(t *testing.T) {
 	if executeMs < 29000 || executeMs > 30000 {
 		t.Errorf("execute_ms = %d; want ~29900 (proves the phase-breakdown points at execute)", executeMs)
 	}
+
+	// The #27 cold-start split: instantiate dominates execute, run ≈ 0. This is
+	// the field AnChat needs to distinguish "fresh instantiate is the sink"
+	// from "the handler is slow".
+	instantiateMs, _ := contextMap["instantiate_ms"].(int64)
+	runMs, _ := contextMap["run_ms"].(int64)
+	if instantiateMs < 29000 || instantiateMs > 30000 {
+		t.Errorf("instantiate_ms = %d; want ~29800 (cold-start dominates execute)", instantiateMs)
+	}
+	if runMs < 0 || runMs > 500 {
+		t.Errorf("run_ms = %d; want ~100 (handler logic is trivial; cold-start is the sink)", runMs)
+	}
 }

 func TestLogSlowInvocation_zeroPhaseTimestampsMeanUnreached(t *testing.T) {
@ -112,7 +127,7 @@ func TestLogSlowInvocation_zeroPhaseTimestampsMeanUnreached(t *testing.T) {
 	start := time.Now().Add(-10 * time.Second)
 	ratelimitDone := start.Add(100 * time.Millisecond)
 	// moduleLoadedAt and executeDoneAt left as zero — module-load failed
-	e.logSlowInvocation(invCtx, start, ratelimitDone, time.Time{}, time.Time{}, "module-load-failed", nil)
+	e.logSlowInvocation(invCtx, start, ratelimitDone, time.Time{}, time.Time{}, 0, "module-load-failed", nil)

 	logs := observed.All()
 	if len(logs) != 1 {
--- a/core/pkg/serverless/execution/executor.go
+++ b/core/pkg/serverless/execution/executor.go
@ -6,12 +6,36 @@ import (
 	cryptorand "crypto/rand"
 	"encoding/json"
 	"fmt"
+	"time"

 	"github.com/tetratelabs/wazero"
 	"github.com/tetratelabs/wazero/api"
 	"go.uber.org/zap"
 )

+// InstantiateTiming captures how long the per-invocation wazero
+// InstantiateModule call took (running TinyGo _start / package init). It rides
+// the ctx so the engine's slow-invoke diagnostic can split the execute phase
+// into cold-start (instantiate) vs handler work (run) — the distinction that
+// pins the bugboard #27 cold-start floor. Nil collector = not measured.
+type InstantiateTiming struct {
+	InstantiateNs int64
+}
+
+type instantiateTimingKey struct{}
+
+// WithInstantiateTiming returns a ctx carrying a fresh InstantiateTiming that
+// ExecuteModule will fill in. The caller reads it back after ExecuteModule.
+func WithInstantiateTiming(ctx context.Context) (context.Context, *InstantiateTiming) {
+	t := &InstantiateTiming{}
+	return context.WithValue(ctx, instantiateTimingKey{}, t), t
+}
+
+func instantiateTimingFrom(ctx context.Context) *InstantiateTiming {
+	t, _ := ctx.Value(instantiateTimingKey{}).(*InstantiateTiming)
+	return t
+}
+
 // Executor handles WASM module execution.
 type Executor struct {
 	runtime wazero.Runtime
@ -101,8 +125,14 @@ func (e *Executor) ExecuteModule(ctx context.Context, compiled wazero.CompiledMo
 		}
 	}

-	// Instantiate and run the module (WASI _start will be called automatically)
+	// Instantiate and run the module (WASI _start will be called automatically).
+	// Time the instantiate so the engine can attribute cold-start vs handler
+	// work (bugboard #27 cold-start floor); no-op when no collector is attached.
+	instStart := time.Now()
 	instance, err := e.runtime.InstantiateModule(ctx, compiled, moduleConfig)
+	if t := instantiateTimingFrom(ctx); t != nil {
+		t.InstantiateNs = time.Since(instStart).Nanoseconds()
+	}
 	if err != nil {
 		// Check if stderr has any output
 		if stderr.Len() > 0 {
--- a/core/pkg/serverless/execution/randsource_test.go
+++ b/core/pkg/serverless/execution/randsource_test.go
@ -8,6 +8,7 @@ import (

 	"github.com/tetratelabs/wazero"
 	"github.com/tetratelabs/wazero/imports/wasi_snapshot_preview1"
+	"go.uber.org/zap"
 )

 // Bugboard #120 — wazero defaults to a DETERMINISTIC (zero-seed) RNG source.
@ -179,3 +180,31 @@ func TestModuleConfig_randWithoutFix_demoDeterministic(t *testing.T) {
 	}
 	// Determinism confirmed → fix is meaningful.
 }
+
+// Bugboard #27 instrumentation: ExecuteModule must record how long the
+// per-invocation InstantiateModule (TinyGo _start cold-start) took into the
+// ctx-attached collector, so the engine can split the execute phase into
+// cold-start vs handler work. Without an attached collector it must be a no-op.
+func TestExecuteModule_recordsInstantiateTiming(t *testing.T) {
+	ctx := context.Background()
+	runtime := wazero.NewRuntime(ctx)
+	defer runtime.Close(ctx)
+	if _, err := wasi_snapshot_preview1.Instantiate(ctx, runtime); err != nil {
+		t.Fatalf("instantiate WASI: %v", err)
+	}
+	compiled, err := runtime.CompileModule(ctx, randProbeWasm)
+	if err != nil {
+		t.Fatalf("compile probe wasm: %v", err)
+	}
+	defer compiled.Close(ctx)
+
+	ex := NewExecutor(runtime, zap.NewNop(), 0)
+
+	tctx, timing := WithInstantiateTiming(ctx)
+	if _, err := ex.ExecuteModule(tctx, compiled, "probe", nil, nil, nil); err != nil {
+		t.Fatalf("ExecuteModule: %v", err)
+	}
+	if timing.InstantiateNs <= 0 {
+		t.Errorf("InstantiateNs = %d; want > 0 (instantiate duration must be recorded)", timing.InstantiateNs)
+	}
+}