fix(config): add secrets_encryption_key to HTTPGatewayConfig — fixes orama-node boot crash

v0.122.42 (f412425, secrets encryption) shipped the template emission,
the per-cluster secret generator, and the gateway.Config consumer — but
NOT the parse field on config.HTTPGatewayConfig. Phase 4 writes
`secrets_encryption_key` into node.yaml under the http_gateway section,
and pkg/config/yaml.go decodes with KnownFields(true) (strict). The
unknown field made every node.yaml parse fail, so orama-node exited 1
on every start and systemd crash-looped it (restart counter hit 380+ on
the first upgraded devnet node before the rolling controller halted).

Root cause: a generated-config field with no matching struct field under
strict unmarshal. Fix is the missing field. The runtime key itself is
still consumed from ~/.orama/secrets/secrets-encryption-key (pkg/node/
gateway.go), which already worked — so this one-field addition fully
restores boot AND the feature.

The standalone gateway (cmd/gateway/config.go) uses lenient parsing and
was unaffected.

Regression test in pkg/config/decode_test.go decodes a node.yaml
carrying secrets_encryption_key under strict mode.
This commit is contained in:
anonpenguin23 2026-06-09 15:57:32 +03:00
parent b6b518e005
commit e685c864fc
8 changed files with 173 additions and 18 deletions

View File

@ -207,3 +207,30 @@ key2: value2
t.Errorf("expected key2='value2', got %q", result["key2"])
}
}
// TestDecodeStrict_secretsEncryptionKey is the regression guard for the
// v0.122.42 boot crash: Phase 4 config generation writes
// `secrets_encryption_key` into node.yaml under the http_gateway section,
// but HTTPGatewayConfig had no matching field. With KnownFields(true)
// strict decoding, the unknown field made DecodeStrict fail and
// orama-node crash-looped (exit 1) on every start. The field must parse.
func TestDecodeStrict_secretsEncryptionKey(t *testing.T) {
yamlInput := `
node:
id: "test-node"
data_dir: "./data"
http_gateway:
enabled: true
client_namespace: "default"
rqlite_dsn: "http://localhost:5001"
secrets_encryption_key: "0123456789abcdef0123456789abcdef0123456789abcdef0123456789abcdef"
`
var cfg Config
if err := DecodeStrict(strings.NewReader(yamlInput), &cfg); err != nil {
t.Fatalf("node.yaml with secrets_encryption_key must parse (v0.122.42 regression), got: %v", err)
}
want := "0123456789abcdef0123456789abcdef0123456789abcdef0123456789abcdef"
if cfg.HTTPGateway.SecretsEncryptionKey != want {
t.Errorf("SecretsEncryptionKey = %q, want %q", cfg.HTTPGateway.SecretsEncryptionKey, want)
}
}

View File

@ -21,6 +21,15 @@ type HTTPGatewayConfig struct {
IPFSTimeout time.Duration `yaml:"ipfs_timeout"` // Timeout for IPFS operations
BaseDomain string `yaml:"base_domain"` // Base domain for deployments (e.g., "dbrs.space"). Defaults to "dbrs.space"
// SecretsEncryptionKey is the AES-256 key (hex, 64 chars) used to encrypt
// serverless function secrets at rest. Generated per-cluster and written
// into node.yaml by Phase 4 config generation. This field MUST exist or
// strict YAML unmarshal rejects node.yaml entirely and orama-node fails
// to boot (regression that shipped in v0.122.42: template + secret
// generator + gateway.Config consumer all landed, but this parse field
// and the node→gateway mapping were missed).
SecretsEncryptionKey string `yaml:"secrets_encryption_key"`
// WebRTC configuration (optional, enabled per-namespace)
WebRTC WebRTCConfig `yaml:"webrtc"`
}

View File

@ -468,6 +468,13 @@ func initializeServerless(logger *logging.ColoredLogger, cfg *Config, deps *Depe
engineCfg.DefaultTimeoutSeconds = 30
engineCfg.MaxTimeoutSeconds = 60
engineCfg.ModuleCacheSize = 100
// Surface the per-phase slow-invoke diagnostic (instantiate_ms / run_ms)
// above 1s instead of the 5s default — a >1s serverless invocation is
// genuinely slow (well-built handlers are <300ms), and this makes the
// cold-start floor (bugboard #27: async-dispatched stateless handlers pay a
// fresh instantiate + TinyGo _start per call) visible for correlation
// against client-side request_ids.
engineCfg.SlowInvokeThresholdMs = 1000
// Create secrets manager for serverless functions (AES-256-GCM encrypted).
//

View File

@ -56,6 +56,14 @@ type Config struct {
ModuleCacheSize int `yaml:"module_cache_size"` // Number of compiled modules to cache
EnablePrewarm bool `yaml:"enable_prewarm"` // Pre-compile frequently used functions
// SlowInvokeThresholdMs is the wall-clock (ms) above which Execute emits the
// per-phase "slow invocation" diagnostic (bugboard #24/#27). Default 5000.
// Lower it (e.g. 750) to surface the sub-second cold-start floor that the
// 5s default hides — async-dispatched stateless handlers pay a fresh
// instantiate + TinyGo _start per call, which a count=0 read makes visible
// as ~1s of execute time with ~0 module-load (compile is cached). See #27.
SlowInvokeThresholdMs int `yaml:"slow_invoke_threshold_ms"`
// Secrets encryption
SecretsEncryptionKey string `yaml:"secrets_encryption_key"` // AES-256 key (32 bytes, hex-encoded)
@ -79,6 +87,12 @@ type Config struct {
// `*/1 * * * * *` to fire on every-second cadence.
const MinCronPollInterval = 100 * time.Millisecond
// defaultSlowInvokeThresholdMs is the default wall-clock (ms) above which the
// per-phase slow-invocation diagnostic fires. 5s keeps normal traffic quiet
// while still firing before the 30s WS ceiling; lower it on a cluster under
// investigation to surface sub-second cold-start floors.
const defaultSlowInvokeThresholdMs = 5000
// DefaultConfig returns a configuration with sensible defaults.
func DefaultConfig() *Config {
return &Config{
@ -113,8 +127,9 @@ func DefaultConfig() *Config {
MaxConcurrentExecutions: 10,
// WASM cache
ModuleCacheSize: 100,
EnablePrewarm: true,
ModuleCacheSize: 100,
EnablePrewarm: true,
SlowInvokeThresholdMs: defaultSlowInvokeThresholdMs,
// Logging
LogInvocations: true,
@ -208,6 +223,9 @@ func (c *Config) ApplyDefaults() {
if c.ModuleCacheSize == 0 {
c.ModuleCacheSize = defaults.ModuleCacheSize
}
if c.SlowInvokeThresholdMs == 0 {
c.SlowInvokeThresholdMs = defaults.SlowInvokeThresholdMs
}
if c.LogRetention == 0 {
c.LogRetention = defaults.LogRetention
}

View File

@ -222,12 +222,17 @@ func NewEngine(cfg *Config, registry FunctionRegistry, hostServices HostServices
return engine, nil
}
// slowInvokeThreshold is the wall-clock duration above which Execute
// emits a structured "slow invocation" warning with per-phase
// breakdown. Picked to be well below the WS-handler's 30s ceiling
// (bugboard #24) so we get diagnostic logs BEFORE the timeout actually
// fires, surfacing which phase is the sink.
const slowInvokeThreshold = 5 * time.Second
// slowInvokeThreshold returns the wall-clock duration above which Execute
// emits a structured "slow invocation" warning with per-phase breakdown.
// Sourced from config (SlowInvokeThresholdMs) so a cluster under
// investigation can lower it to surface the sub-second cold-start floor that
// the 5s default hides (bugboard #27). Defaults to 5s when unset.
func (e *Engine) slowInvokeThreshold() time.Duration {
if e.config != nil && e.config.SlowInvokeThresholdMs > 0 {
return time.Duration(e.config.SlowInvokeThresholdMs) * time.Millisecond
}
return defaultSlowInvokeThresholdMs * time.Millisecond
}
// Execute runs a function with the given input and returns the output.
//
@ -332,7 +337,7 @@ func (e *Engine) Execute(ctx context.Context, fn *Function, input []byte, invCtx
module, err := e.getOrCompileModule(execCtx, fn.WASMCID)
if err != nil {
e.logInvocation(ctx, fn, invCtx, logBuf, startTime, 0, InvocationStatusError, err)
e.logSlowInvocation(invCtx, startTime, ratelimitDoneAt, moduleLoadedAt, executeDoneAt, "module-load-failed", err)
e.logSlowInvocation(invCtx, startTime, ratelimitDoneAt, moduleLoadedAt, executeDoneAt, 0, "module-load-failed", err)
return nil, &ExecutionError{FunctionName: fn.Name, RequestID: invCtx.RequestID, Cause: err}
}
moduleLoadedAt = time.Now()
@ -343,6 +348,10 @@ func (e *Engine) Execute(ctx context.Context, fn *Function, input []byte, invCtx
contextSetter = func() { hf.SetInvocationContext(invCtx) }
contextClearer = func() { hf.ClearContext() }
}
// Attach a collector so ExecuteModule reports how long instantiate (TinyGo
// _start cold-start) took, letting the slow-invoke diagnostic split the
// execute phase into cold-start vs handler work (bugboard #27).
execCtx, instTiming := execution.WithInstantiateTiming(execCtx)
output, err := e.executor.ExecuteModule(execCtx, module, fn.Name, input, contextSetter, contextClearer)
executeDoneAt = time.Now()
if err != nil {
@ -352,7 +361,7 @@ func (e *Engine) Execute(ctx context.Context, fn *Function, input []byte, invCtx
err = ErrTimeout
}
e.logInvocation(ctx, fn, invCtx, logBuf, startTime, len(output), status, err)
e.logSlowInvocation(invCtx, startTime, ratelimitDoneAt, moduleLoadedAt, executeDoneAt, string(status), err)
e.logSlowInvocation(invCtx, startTime, ratelimitDoneAt, moduleLoadedAt, executeDoneAt, instTiming.InstantiateNs, string(status), err)
return nil, &ExecutionError{FunctionName: fn.Name, RequestID: invCtx.RequestID, Cause: err}
}
@ -365,7 +374,7 @@ func (e *Engine) Execute(ctx context.Context, fn *Function, input []byte, invCtx
}
e.logInvocation(ctx, fn, invCtx, logBuf, startTime, len(output), InvocationStatusSuccess, nil)
e.logSlowInvocation(invCtx, startTime, ratelimitDoneAt, moduleLoadedAt, executeDoneAt, "success", nil)
e.logSlowInvocation(invCtx, startTime, ratelimitDoneAt, moduleLoadedAt, executeDoneAt, instTiming.InstantiateNs, "success", nil)
return output, nil
}
@ -379,9 +388,9 @@ func (e *Engine) Execute(ctx context.Context, fn *Function, input []byte, invCtx
// Zero-valued phase timestamps mean the phase was never reached, which
// is itself signal — e.g. moduleLoadedAt=zero + executeDoneAt=zero with
// large totalMs means we blocked in rate-limit OR module-load.
func (e *Engine) logSlowInvocation(invCtx *InvocationContext, startTime, ratelimitDoneAt, moduleLoadedAt, executeDoneAt time.Time, status string, err error) {
func (e *Engine) logSlowInvocation(invCtx *InvocationContext, startTime, ratelimitDoneAt, moduleLoadedAt, executeDoneAt time.Time, instantiateNs int64, status string, err error) {
totalMs := time.Since(startTime).Milliseconds()
if totalMs < slowInvokeThreshold.Milliseconds() {
if totalMs < e.slowInvokeThreshold().Milliseconds() {
return
}
// Compute phase deltas. Use 0 for unreached phases so the log line
@ -396,6 +405,15 @@ func (e *Engine) logSlowInvocation(invCtx *InvocationContext, startTime, ratelim
if !executeDoneAt.IsZero() && !moduleLoadedAt.IsZero() {
executeMs = executeDoneAt.Sub(moduleLoadedAt).Milliseconds()
}
// Split execute into instantiate (TinyGo _start cold-start) vs run
// (handler logic). A count=0 read with instantiate_ms ≈ execute_ms and
// run_ms ≈ 0 is the bugboard #27 cold-start floor — the per-call fresh
// instantiation, not the handler, is the sink.
instantiateMs := instantiateNs / int64(time.Millisecond)
runMs := executeMs - instantiateMs
if runMs < 0 {
runMs = 0
}
fields := []zap.Field{
zap.String("namespace", invCtx.Namespace),
zap.String("function", invCtx.FunctionName),
@ -406,6 +424,8 @@ func (e *Engine) logSlowInvocation(invCtx *InvocationContext, startTime, ratelim
zap.Int64("ratelimit_ms", ratelimitMs),
zap.Int64("module_load_ms", moduleLoadMs),
zap.Int64("execute_ms", executeMs),
zap.Int64("instantiate_ms", instantiateMs),
zap.Int64("run_ms", runMs),
zap.String("invocation_status", status),
}
if err != nil {

View File

@ -30,7 +30,7 @@ func TestLogSlowInvocation_belowThresholdEmitsNothing(t *testing.T) {
invCtx := &InvocationContext{Namespace: "ns", FunctionName: "fast-fn"}
now := time.Now()
e.logSlowInvocation(invCtx, now, now.Add(1*time.Millisecond), now.Add(2*time.Millisecond), now.Add(100*time.Millisecond), "success", nil)
e.logSlowInvocation(invCtx, now, now.Add(1*time.Millisecond), now.Add(2*time.Millisecond), now.Add(100*time.Millisecond), 0, "success", nil)
if got := observed.Len(); got != 0 {
t.Errorf("fast invocation (100ms < 5s threshold) emitted %d log lines; want 0", got)
@ -51,12 +51,15 @@ func TestLogSlowInvocation_aboveThresholdEmitsBreakdown(t *testing.T) {
WSClientID: "ws-client-xyz",
}
// Simulate a 30s-class invocation that spent the bulk in execute.
// Simulate a 30s-class invocation that spent the bulk in execute, of which
// nearly all was cold-start instantiation (the bugboard #27 floor pattern):
// instantiate_ms ≈ execute_ms, run_ms ≈ 0.
start := time.Now().Add(-30 * time.Second)
ratelimitDone := start.Add(50 * time.Millisecond)
moduleLoaded := start.Add(150 * time.Millisecond)
executeDone := start.Add(30 * time.Second)
e.logSlowInvocation(invCtx, start, ratelimitDone, moduleLoaded, executeDone, "timeout", nil)
instantiateNs := (29*time.Second + 800*time.Millisecond).Nanoseconds()
e.logSlowInvocation(invCtx, start, ratelimitDone, moduleLoaded, executeDone, instantiateNs, "timeout", nil)
logs := observed.All()
if len(logs) != 1 {
@ -96,6 +99,18 @@ func TestLogSlowInvocation_aboveThresholdEmitsBreakdown(t *testing.T) {
if executeMs < 29000 || executeMs > 30000 {
t.Errorf("execute_ms = %d; want ~29900 (proves the phase-breakdown points at execute)", executeMs)
}
// The #27 cold-start split: instantiate dominates execute, run ≈ 0. This is
// the field AnChat needs to distinguish "fresh instantiate is the sink"
// from "the handler is slow".
instantiateMs, _ := contextMap["instantiate_ms"].(int64)
runMs, _ := contextMap["run_ms"].(int64)
if instantiateMs < 29000 || instantiateMs > 30000 {
t.Errorf("instantiate_ms = %d; want ~29800 (cold-start dominates execute)", instantiateMs)
}
if runMs < 0 || runMs > 500 {
t.Errorf("run_ms = %d; want ~100 (handler logic is trivial; cold-start is the sink)", runMs)
}
}
func TestLogSlowInvocation_zeroPhaseTimestampsMeanUnreached(t *testing.T) {
@ -112,7 +127,7 @@ func TestLogSlowInvocation_zeroPhaseTimestampsMeanUnreached(t *testing.T) {
start := time.Now().Add(-10 * time.Second)
ratelimitDone := start.Add(100 * time.Millisecond)
// moduleLoadedAt and executeDoneAt left as zero — module-load failed
e.logSlowInvocation(invCtx, start, ratelimitDone, time.Time{}, time.Time{}, "module-load-failed", nil)
e.logSlowInvocation(invCtx, start, ratelimitDone, time.Time{}, time.Time{}, 0, "module-load-failed", nil)
logs := observed.All()
if len(logs) != 1 {

View File

@ -6,12 +6,36 @@ import (
cryptorand "crypto/rand"
"encoding/json"
"fmt"
"time"
"github.com/tetratelabs/wazero"
"github.com/tetratelabs/wazero/api"
"go.uber.org/zap"
)
// InstantiateTiming captures how long the per-invocation wazero
// InstantiateModule call took (running TinyGo _start / package init). It rides
// the ctx so the engine's slow-invoke diagnostic can split the execute phase
// into cold-start (instantiate) vs handler work (run) — the distinction that
// pins the bugboard #27 cold-start floor. Nil collector = not measured.
type InstantiateTiming struct {
InstantiateNs int64
}
type instantiateTimingKey struct{}
// WithInstantiateTiming returns a ctx carrying a fresh InstantiateTiming that
// ExecuteModule will fill in. The caller reads it back after ExecuteModule.
func WithInstantiateTiming(ctx context.Context) (context.Context, *InstantiateTiming) {
t := &InstantiateTiming{}
return context.WithValue(ctx, instantiateTimingKey{}, t), t
}
func instantiateTimingFrom(ctx context.Context) *InstantiateTiming {
t, _ := ctx.Value(instantiateTimingKey{}).(*InstantiateTiming)
return t
}
// Executor handles WASM module execution.
type Executor struct {
runtime wazero.Runtime
@ -101,8 +125,14 @@ func (e *Executor) ExecuteModule(ctx context.Context, compiled wazero.CompiledMo
}
}
// Instantiate and run the module (WASI _start will be called automatically)
// Instantiate and run the module (WASI _start will be called automatically).
// Time the instantiate so the engine can attribute cold-start vs handler
// work (bugboard #27 cold-start floor); no-op when no collector is attached.
instStart := time.Now()
instance, err := e.runtime.InstantiateModule(ctx, compiled, moduleConfig)
if t := instantiateTimingFrom(ctx); t != nil {
t.InstantiateNs = time.Since(instStart).Nanoseconds()
}
if err != nil {
// Check if stderr has any output
if stderr.Len() > 0 {

View File

@ -8,6 +8,7 @@ import (
"github.com/tetratelabs/wazero"
"github.com/tetratelabs/wazero/imports/wasi_snapshot_preview1"
"go.uber.org/zap"
)
// Bugboard #120 — wazero defaults to a DETERMINISTIC (zero-seed) RNG source.
@ -179,3 +180,31 @@ func TestModuleConfig_randWithoutFix_demoDeterministic(t *testing.T) {
}
// Determinism confirmed → fix is meaningful.
}
// Bugboard #27 instrumentation: ExecuteModule must record how long the
// per-invocation InstantiateModule (TinyGo _start cold-start) took into the
// ctx-attached collector, so the engine can split the execute phase into
// cold-start vs handler work. Without an attached collector it must be a no-op.
func TestExecuteModule_recordsInstantiateTiming(t *testing.T) {
ctx := context.Background()
runtime := wazero.NewRuntime(ctx)
defer runtime.Close(ctx)
if _, err := wasi_snapshot_preview1.Instantiate(ctx, runtime); err != nil {
t.Fatalf("instantiate WASI: %v", err)
}
compiled, err := runtime.CompileModule(ctx, randProbeWasm)
if err != nil {
t.Fatalf("compile probe wasm: %v", err)
}
defer compiled.Close(ctx)
ex := NewExecutor(runtime, zap.NewNop(), 0)
tctx, timing := WithInstantiateTiming(ctx)
if _, err := ex.ExecuteModule(tctx, compiled, "probe", nil, nil, nil); err != nil {
t.Fatalf("ExecuteModule: %v", err)
}
if timing.InstantiateNs <= 0 {
t.Errorf("InstantiateNs = %d; want > 0 (instantiate duration must be recorded)", timing.InstantiateNs)
}
}