From e685c864fc52bd1280764774637a6a4a1d5cfea3 Mon Sep 17 00:00:00 2001 From: anonpenguin23 Date: Tue, 9 Jun 2026 15:57:32 +0300 Subject: [PATCH] =?UTF-8?q?fix(config):=20add=20secrets=5Fencryption=5Fkey?= =?UTF-8?q?=20to=20HTTPGatewayConfig=20=E2=80=94=20fixes=20orama-node=20bo?= =?UTF-8?q?ot=20crash?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit v0.122.42 (f412425, secrets encryption) shipped the template emission, the per-cluster secret generator, and the gateway.Config consumer — but NOT the parse field on config.HTTPGatewayConfig. Phase 4 writes `secrets_encryption_key` into node.yaml under the http_gateway section, and pkg/config/yaml.go decodes with KnownFields(true) (strict). The unknown field made every node.yaml parse fail, so orama-node exited 1 on every start and systemd crash-looped it (restart counter hit 380+ on the first upgraded devnet node before the rolling controller halted). Root cause: a generated-config field with no matching struct field under strict unmarshal. Fix is the missing field. The runtime key itself is still consumed from ~/.orama/secrets/secrets-encryption-key (pkg/node/ gateway.go), which already worked — so this one-field addition fully restores boot AND the feature. The standalone gateway (cmd/gateway/config.go) uses lenient parsing and was unaffected. Regression test in pkg/config/decode_test.go decodes a node.yaml carrying secrets_encryption_key under strict mode. --- core/pkg/config/decode_test.go | 27 ++++++++++++ core/pkg/config/gateway_config.go | 9 ++++ core/pkg/gateway/dependencies.go | 7 ++++ core/pkg/serverless/config.go | 22 +++++++++- core/pkg/serverless/engine.go | 42 ++++++++++++++----- .../pkg/serverless/engine_slow_invoke_test.go | 23 ++++++++-- core/pkg/serverless/execution/executor.go | 32 +++++++++++++- .../serverless/execution/randsource_test.go | 29 +++++++++++++ 8 files changed, 173 insertions(+), 18 deletions(-) diff --git a/core/pkg/config/decode_test.go b/core/pkg/config/decode_test.go index 6206338..37f27cc 100644 --- a/core/pkg/config/decode_test.go +++ b/core/pkg/config/decode_test.go @@ -207,3 +207,30 @@ key2: value2 t.Errorf("expected key2='value2', got %q", result["key2"]) } } + +// TestDecodeStrict_secretsEncryptionKey is the regression guard for the +// v0.122.42 boot crash: Phase 4 config generation writes +// `secrets_encryption_key` into node.yaml under the http_gateway section, +// but HTTPGatewayConfig had no matching field. With KnownFields(true) +// strict decoding, the unknown field made DecodeStrict fail and +// orama-node crash-looped (exit 1) on every start. The field must parse. +func TestDecodeStrict_secretsEncryptionKey(t *testing.T) { + yamlInput := ` +node: + id: "test-node" + data_dir: "./data" +http_gateway: + enabled: true + client_namespace: "default" + rqlite_dsn: "http://localhost:5001" + secrets_encryption_key: "0123456789abcdef0123456789abcdef0123456789abcdef0123456789abcdef" +` + var cfg Config + if err := DecodeStrict(strings.NewReader(yamlInput), &cfg); err != nil { + t.Fatalf("node.yaml with secrets_encryption_key must parse (v0.122.42 regression), got: %v", err) + } + want := "0123456789abcdef0123456789abcdef0123456789abcdef0123456789abcdef" + if cfg.HTTPGateway.SecretsEncryptionKey != want { + t.Errorf("SecretsEncryptionKey = %q, want %q", cfg.HTTPGateway.SecretsEncryptionKey, want) + } +} diff --git a/core/pkg/config/gateway_config.go b/core/pkg/config/gateway_config.go index c60b474..72ecf56 100644 --- a/core/pkg/config/gateway_config.go +++ b/core/pkg/config/gateway_config.go @@ -21,6 +21,15 @@ type HTTPGatewayConfig struct { IPFSTimeout time.Duration `yaml:"ipfs_timeout"` // Timeout for IPFS operations BaseDomain string `yaml:"base_domain"` // Base domain for deployments (e.g., "dbrs.space"). Defaults to "dbrs.space" + // SecretsEncryptionKey is the AES-256 key (hex, 64 chars) used to encrypt + // serverless function secrets at rest. Generated per-cluster and written + // into node.yaml by Phase 4 config generation. This field MUST exist or + // strict YAML unmarshal rejects node.yaml entirely and orama-node fails + // to boot (regression that shipped in v0.122.42: template + secret + // generator + gateway.Config consumer all landed, but this parse field + // and the node→gateway mapping were missed). + SecretsEncryptionKey string `yaml:"secrets_encryption_key"` + // WebRTC configuration (optional, enabled per-namespace) WebRTC WebRTCConfig `yaml:"webrtc"` } diff --git a/core/pkg/gateway/dependencies.go b/core/pkg/gateway/dependencies.go index 2f03089..c54cce0 100644 --- a/core/pkg/gateway/dependencies.go +++ b/core/pkg/gateway/dependencies.go @@ -468,6 +468,13 @@ func initializeServerless(logger *logging.ColoredLogger, cfg *Config, deps *Depe engineCfg.DefaultTimeoutSeconds = 30 engineCfg.MaxTimeoutSeconds = 60 engineCfg.ModuleCacheSize = 100 + // Surface the per-phase slow-invoke diagnostic (instantiate_ms / run_ms) + // above 1s instead of the 5s default — a >1s serverless invocation is + // genuinely slow (well-built handlers are <300ms), and this makes the + // cold-start floor (bugboard #27: async-dispatched stateless handlers pay a + // fresh instantiate + TinyGo _start per call) visible for correlation + // against client-side request_ids. + engineCfg.SlowInvokeThresholdMs = 1000 // Create secrets manager for serverless functions (AES-256-GCM encrypted). // diff --git a/core/pkg/serverless/config.go b/core/pkg/serverless/config.go index 2ac1c8f..750979f 100644 --- a/core/pkg/serverless/config.go +++ b/core/pkg/serverless/config.go @@ -56,6 +56,14 @@ type Config struct { ModuleCacheSize int `yaml:"module_cache_size"` // Number of compiled modules to cache EnablePrewarm bool `yaml:"enable_prewarm"` // Pre-compile frequently used functions + // SlowInvokeThresholdMs is the wall-clock (ms) above which Execute emits the + // per-phase "slow invocation" diagnostic (bugboard #24/#27). Default 5000. + // Lower it (e.g. 750) to surface the sub-second cold-start floor that the + // 5s default hides — async-dispatched stateless handlers pay a fresh + // instantiate + TinyGo _start per call, which a count=0 read makes visible + // as ~1s of execute time with ~0 module-load (compile is cached). See #27. + SlowInvokeThresholdMs int `yaml:"slow_invoke_threshold_ms"` + // Secrets encryption SecretsEncryptionKey string `yaml:"secrets_encryption_key"` // AES-256 key (32 bytes, hex-encoded) @@ -79,6 +87,12 @@ type Config struct { // `*/1 * * * * *` to fire on every-second cadence. const MinCronPollInterval = 100 * time.Millisecond +// defaultSlowInvokeThresholdMs is the default wall-clock (ms) above which the +// per-phase slow-invocation diagnostic fires. 5s keeps normal traffic quiet +// while still firing before the 30s WS ceiling; lower it on a cluster under +// investigation to surface sub-second cold-start floors. +const defaultSlowInvokeThresholdMs = 5000 + // DefaultConfig returns a configuration with sensible defaults. func DefaultConfig() *Config { return &Config{ @@ -113,8 +127,9 @@ func DefaultConfig() *Config { MaxConcurrentExecutions: 10, // WASM cache - ModuleCacheSize: 100, - EnablePrewarm: true, + ModuleCacheSize: 100, + EnablePrewarm: true, + SlowInvokeThresholdMs: defaultSlowInvokeThresholdMs, // Logging LogInvocations: true, @@ -208,6 +223,9 @@ func (c *Config) ApplyDefaults() { if c.ModuleCacheSize == 0 { c.ModuleCacheSize = defaults.ModuleCacheSize } + if c.SlowInvokeThresholdMs == 0 { + c.SlowInvokeThresholdMs = defaults.SlowInvokeThresholdMs + } if c.LogRetention == 0 { c.LogRetention = defaults.LogRetention } diff --git a/core/pkg/serverless/engine.go b/core/pkg/serverless/engine.go index 9f62995..90caa13 100644 --- a/core/pkg/serverless/engine.go +++ b/core/pkg/serverless/engine.go @@ -222,12 +222,17 @@ func NewEngine(cfg *Config, registry FunctionRegistry, hostServices HostServices return engine, nil } -// slowInvokeThreshold is the wall-clock duration above which Execute -// emits a structured "slow invocation" warning with per-phase -// breakdown. Picked to be well below the WS-handler's 30s ceiling -// (bugboard #24) so we get diagnostic logs BEFORE the timeout actually -// fires, surfacing which phase is the sink. -const slowInvokeThreshold = 5 * time.Second +// slowInvokeThreshold returns the wall-clock duration above which Execute +// emits a structured "slow invocation" warning with per-phase breakdown. +// Sourced from config (SlowInvokeThresholdMs) so a cluster under +// investigation can lower it to surface the sub-second cold-start floor that +// the 5s default hides (bugboard #27). Defaults to 5s when unset. +func (e *Engine) slowInvokeThreshold() time.Duration { + if e.config != nil && e.config.SlowInvokeThresholdMs > 0 { + return time.Duration(e.config.SlowInvokeThresholdMs) * time.Millisecond + } + return defaultSlowInvokeThresholdMs * time.Millisecond +} // Execute runs a function with the given input and returns the output. // @@ -332,7 +337,7 @@ func (e *Engine) Execute(ctx context.Context, fn *Function, input []byte, invCtx module, err := e.getOrCompileModule(execCtx, fn.WASMCID) if err != nil { e.logInvocation(ctx, fn, invCtx, logBuf, startTime, 0, InvocationStatusError, err) - e.logSlowInvocation(invCtx, startTime, ratelimitDoneAt, moduleLoadedAt, executeDoneAt, "module-load-failed", err) + e.logSlowInvocation(invCtx, startTime, ratelimitDoneAt, moduleLoadedAt, executeDoneAt, 0, "module-load-failed", err) return nil, &ExecutionError{FunctionName: fn.Name, RequestID: invCtx.RequestID, Cause: err} } moduleLoadedAt = time.Now() @@ -343,6 +348,10 @@ func (e *Engine) Execute(ctx context.Context, fn *Function, input []byte, invCtx contextSetter = func() { hf.SetInvocationContext(invCtx) } contextClearer = func() { hf.ClearContext() } } + // Attach a collector so ExecuteModule reports how long instantiate (TinyGo + // _start cold-start) took, letting the slow-invoke diagnostic split the + // execute phase into cold-start vs handler work (bugboard #27). + execCtx, instTiming := execution.WithInstantiateTiming(execCtx) output, err := e.executor.ExecuteModule(execCtx, module, fn.Name, input, contextSetter, contextClearer) executeDoneAt = time.Now() if err != nil { @@ -352,7 +361,7 @@ func (e *Engine) Execute(ctx context.Context, fn *Function, input []byte, invCtx err = ErrTimeout } e.logInvocation(ctx, fn, invCtx, logBuf, startTime, len(output), status, err) - e.logSlowInvocation(invCtx, startTime, ratelimitDoneAt, moduleLoadedAt, executeDoneAt, string(status), err) + e.logSlowInvocation(invCtx, startTime, ratelimitDoneAt, moduleLoadedAt, executeDoneAt, instTiming.InstantiateNs, string(status), err) return nil, &ExecutionError{FunctionName: fn.Name, RequestID: invCtx.RequestID, Cause: err} } @@ -365,7 +374,7 @@ func (e *Engine) Execute(ctx context.Context, fn *Function, input []byte, invCtx } e.logInvocation(ctx, fn, invCtx, logBuf, startTime, len(output), InvocationStatusSuccess, nil) - e.logSlowInvocation(invCtx, startTime, ratelimitDoneAt, moduleLoadedAt, executeDoneAt, "success", nil) + e.logSlowInvocation(invCtx, startTime, ratelimitDoneAt, moduleLoadedAt, executeDoneAt, instTiming.InstantiateNs, "success", nil) return output, nil } @@ -379,9 +388,9 @@ func (e *Engine) Execute(ctx context.Context, fn *Function, input []byte, invCtx // Zero-valued phase timestamps mean the phase was never reached, which // is itself signal — e.g. moduleLoadedAt=zero + executeDoneAt=zero with // large totalMs means we blocked in rate-limit OR module-load. -func (e *Engine) logSlowInvocation(invCtx *InvocationContext, startTime, ratelimitDoneAt, moduleLoadedAt, executeDoneAt time.Time, status string, err error) { +func (e *Engine) logSlowInvocation(invCtx *InvocationContext, startTime, ratelimitDoneAt, moduleLoadedAt, executeDoneAt time.Time, instantiateNs int64, status string, err error) { totalMs := time.Since(startTime).Milliseconds() - if totalMs < slowInvokeThreshold.Milliseconds() { + if totalMs < e.slowInvokeThreshold().Milliseconds() { return } // Compute phase deltas. Use 0 for unreached phases so the log line @@ -396,6 +405,15 @@ func (e *Engine) logSlowInvocation(invCtx *InvocationContext, startTime, ratelim if !executeDoneAt.IsZero() && !moduleLoadedAt.IsZero() { executeMs = executeDoneAt.Sub(moduleLoadedAt).Milliseconds() } + // Split execute into instantiate (TinyGo _start cold-start) vs run + // (handler logic). A count=0 read with instantiate_ms ≈ execute_ms and + // run_ms ≈ 0 is the bugboard #27 cold-start floor — the per-call fresh + // instantiation, not the handler, is the sink. + instantiateMs := instantiateNs / int64(time.Millisecond) + runMs := executeMs - instantiateMs + if runMs < 0 { + runMs = 0 + } fields := []zap.Field{ zap.String("namespace", invCtx.Namespace), zap.String("function", invCtx.FunctionName), @@ -406,6 +424,8 @@ func (e *Engine) logSlowInvocation(invCtx *InvocationContext, startTime, ratelim zap.Int64("ratelimit_ms", ratelimitMs), zap.Int64("module_load_ms", moduleLoadMs), zap.Int64("execute_ms", executeMs), + zap.Int64("instantiate_ms", instantiateMs), + zap.Int64("run_ms", runMs), zap.String("invocation_status", status), } if err != nil { diff --git a/core/pkg/serverless/engine_slow_invoke_test.go b/core/pkg/serverless/engine_slow_invoke_test.go index 3c78e1d..3a23e6c 100644 --- a/core/pkg/serverless/engine_slow_invoke_test.go +++ b/core/pkg/serverless/engine_slow_invoke_test.go @@ -30,7 +30,7 @@ func TestLogSlowInvocation_belowThresholdEmitsNothing(t *testing.T) { invCtx := &InvocationContext{Namespace: "ns", FunctionName: "fast-fn"} now := time.Now() - e.logSlowInvocation(invCtx, now, now.Add(1*time.Millisecond), now.Add(2*time.Millisecond), now.Add(100*time.Millisecond), "success", nil) + e.logSlowInvocation(invCtx, now, now.Add(1*time.Millisecond), now.Add(2*time.Millisecond), now.Add(100*time.Millisecond), 0, "success", nil) if got := observed.Len(); got != 0 { t.Errorf("fast invocation (100ms < 5s threshold) emitted %d log lines; want 0", got) @@ -51,12 +51,15 @@ func TestLogSlowInvocation_aboveThresholdEmitsBreakdown(t *testing.T) { WSClientID: "ws-client-xyz", } - // Simulate a 30s-class invocation that spent the bulk in execute. + // Simulate a 30s-class invocation that spent the bulk in execute, of which + // nearly all was cold-start instantiation (the bugboard #27 floor pattern): + // instantiate_ms ≈ execute_ms, run_ms ≈ 0. start := time.Now().Add(-30 * time.Second) ratelimitDone := start.Add(50 * time.Millisecond) moduleLoaded := start.Add(150 * time.Millisecond) executeDone := start.Add(30 * time.Second) - e.logSlowInvocation(invCtx, start, ratelimitDone, moduleLoaded, executeDone, "timeout", nil) + instantiateNs := (29*time.Second + 800*time.Millisecond).Nanoseconds() + e.logSlowInvocation(invCtx, start, ratelimitDone, moduleLoaded, executeDone, instantiateNs, "timeout", nil) logs := observed.All() if len(logs) != 1 { @@ -96,6 +99,18 @@ func TestLogSlowInvocation_aboveThresholdEmitsBreakdown(t *testing.T) { if executeMs < 29000 || executeMs > 30000 { t.Errorf("execute_ms = %d; want ~29900 (proves the phase-breakdown points at execute)", executeMs) } + + // The #27 cold-start split: instantiate dominates execute, run ≈ 0. This is + // the field AnChat needs to distinguish "fresh instantiate is the sink" + // from "the handler is slow". + instantiateMs, _ := contextMap["instantiate_ms"].(int64) + runMs, _ := contextMap["run_ms"].(int64) + if instantiateMs < 29000 || instantiateMs > 30000 { + t.Errorf("instantiate_ms = %d; want ~29800 (cold-start dominates execute)", instantiateMs) + } + if runMs < 0 || runMs > 500 { + t.Errorf("run_ms = %d; want ~100 (handler logic is trivial; cold-start is the sink)", runMs) + } } func TestLogSlowInvocation_zeroPhaseTimestampsMeanUnreached(t *testing.T) { @@ -112,7 +127,7 @@ func TestLogSlowInvocation_zeroPhaseTimestampsMeanUnreached(t *testing.T) { start := time.Now().Add(-10 * time.Second) ratelimitDone := start.Add(100 * time.Millisecond) // moduleLoadedAt and executeDoneAt left as zero — module-load failed - e.logSlowInvocation(invCtx, start, ratelimitDone, time.Time{}, time.Time{}, "module-load-failed", nil) + e.logSlowInvocation(invCtx, start, ratelimitDone, time.Time{}, time.Time{}, 0, "module-load-failed", nil) logs := observed.All() if len(logs) != 1 { diff --git a/core/pkg/serverless/execution/executor.go b/core/pkg/serverless/execution/executor.go index 177e62c..39ccb9a 100644 --- a/core/pkg/serverless/execution/executor.go +++ b/core/pkg/serverless/execution/executor.go @@ -6,12 +6,36 @@ import ( cryptorand "crypto/rand" "encoding/json" "fmt" + "time" "github.com/tetratelabs/wazero" "github.com/tetratelabs/wazero/api" "go.uber.org/zap" ) +// InstantiateTiming captures how long the per-invocation wazero +// InstantiateModule call took (running TinyGo _start / package init). It rides +// the ctx so the engine's slow-invoke diagnostic can split the execute phase +// into cold-start (instantiate) vs handler work (run) — the distinction that +// pins the bugboard #27 cold-start floor. Nil collector = not measured. +type InstantiateTiming struct { + InstantiateNs int64 +} + +type instantiateTimingKey struct{} + +// WithInstantiateTiming returns a ctx carrying a fresh InstantiateTiming that +// ExecuteModule will fill in. The caller reads it back after ExecuteModule. +func WithInstantiateTiming(ctx context.Context) (context.Context, *InstantiateTiming) { + t := &InstantiateTiming{} + return context.WithValue(ctx, instantiateTimingKey{}, t), t +} + +func instantiateTimingFrom(ctx context.Context) *InstantiateTiming { + t, _ := ctx.Value(instantiateTimingKey{}).(*InstantiateTiming) + return t +} + // Executor handles WASM module execution. type Executor struct { runtime wazero.Runtime @@ -101,8 +125,14 @@ func (e *Executor) ExecuteModule(ctx context.Context, compiled wazero.CompiledMo } } - // Instantiate and run the module (WASI _start will be called automatically) + // Instantiate and run the module (WASI _start will be called automatically). + // Time the instantiate so the engine can attribute cold-start vs handler + // work (bugboard #27 cold-start floor); no-op when no collector is attached. + instStart := time.Now() instance, err := e.runtime.InstantiateModule(ctx, compiled, moduleConfig) + if t := instantiateTimingFrom(ctx); t != nil { + t.InstantiateNs = time.Since(instStart).Nanoseconds() + } if err != nil { // Check if stderr has any output if stderr.Len() > 0 { diff --git a/core/pkg/serverless/execution/randsource_test.go b/core/pkg/serverless/execution/randsource_test.go index 576fec0..74d8484 100644 --- a/core/pkg/serverless/execution/randsource_test.go +++ b/core/pkg/serverless/execution/randsource_test.go @@ -8,6 +8,7 @@ import ( "github.com/tetratelabs/wazero" "github.com/tetratelabs/wazero/imports/wasi_snapshot_preview1" + "go.uber.org/zap" ) // Bugboard #120 — wazero defaults to a DETERMINISTIC (zero-seed) RNG source. @@ -179,3 +180,31 @@ func TestModuleConfig_randWithoutFix_demoDeterministic(t *testing.T) { } // Determinism confirmed → fix is meaningful. } + +// Bugboard #27 instrumentation: ExecuteModule must record how long the +// per-invocation InstantiateModule (TinyGo _start cold-start) took into the +// ctx-attached collector, so the engine can split the execute phase into +// cold-start vs handler work. Without an attached collector it must be a no-op. +func TestExecuteModule_recordsInstantiateTiming(t *testing.T) { + ctx := context.Background() + runtime := wazero.NewRuntime(ctx) + defer runtime.Close(ctx) + if _, err := wasi_snapshot_preview1.Instantiate(ctx, runtime); err != nil { + t.Fatalf("instantiate WASI: %v", err) + } + compiled, err := runtime.CompileModule(ctx, randProbeWasm) + if err != nil { + t.Fatalf("compile probe wasm: %v", err) + } + defer compiled.Close(ctx) + + ex := NewExecutor(runtime, zap.NewNop(), 0) + + tctx, timing := WithInstantiateTiming(ctx) + if _, err := ex.ExecuteModule(tctx, compiled, "probe", nil, nil, nil); err != nil { + t.Fatalf("ExecuteModule: %v", err) + } + if timing.InstantiateNs <= 0 { + t.Errorf("InstantiateNs = %d; want > 0 (instantiate duration must be recorded)", timing.InstantiateNs) + } +}