commit for version used in evaluation of thesis

This commit is contained in:
Patryk Hegenberg 2026-03-29 10:03:18 +02:00
commit 72635dc7b9
27 changed files with 6084 additions and 0 deletions

50
Makefile Normal file
View file

@ -0,0 +1,50 @@
# ── Build configuration ───────────────────────────────────────────────────────
# The finished executable is written to $(BUILD_DIR)/$(BINARY).
BUILD_DIR := build
BINARY := guenther

# Go entry point handed to `go build`.
CMD := ./cmd/pipeline/main.go

# Configuration file passed to the binary by `make run`.
CONFIG := configs/default.yaml

# Container image used by the Docker-based `build` target.
GO_IMAGE := golang:bookworm

# Build tags are shared by `go build` and `go test`; LDFLAGS is forwarded to the
# Go linker through -ldflags. GO_BUILD_FLAGS must stay below both because `:=`
# expands its right-hand side immediately.
BUILD_TAGS := duckdb_arrow
LDFLAGS := -s -w
GO_BUILD_FLAGS := -tags=$(BUILD_TAGS) -buildvcs=false -ldflags='$(LDFLAGS)'
# ── Targets ───────────────────────────────────────────────────────────────────
# Every target below is a command name rather than a file, so each one is
# declared phony; make then never confuses it with a file of the same name.
.PHONY: all
.PHONY: build build-local
.PHONY: test run
.PHONY: clean help

# Default goal: the containerised build.
all: build
## build: Build the binary inside a Docker container (no local toolchain needed)
# The repository is bind-mounted at /app inside the container and the binary is
# written to $(BUILD_DIR)/$(BINARY) on the host through that mount.
#
# $(CURDIR) is used for the mount source instead of $(PWD): make sets CURDIR
# itself (and updates it for `make -C <dir>`), whereas PWD is only inherited
# from the environment and can be unset or stale. The mount argument is quoted
# so a checkout path containing spaces is passed to docker as one word.
build:
	@mkdir -p $(BUILD_DIR)
	docker run --rm \
		-v "$(CURDIR):/app:Z" \
		-w /app \
		$(GO_IMAGE) \
		sh -c "apt-get update -qq && \
			apt-get install -y -qq gcc libc6-dev && \
			CGO_ENABLED=1 go build $(GO_BUILD_FLAGS) -o $(BUILD_DIR)/$(BINARY) $(CMD) && \
			echo BUILD_OK" \
		2>&1
## build-local: Build the binary using the local Go toolchain (requires gcc)
# Runs the same `go build` invocation as the `build` target, directly on the
# host instead of inside a container.
build-local:
	@mkdir -p $(BUILD_DIR)
	CGO_ENABLED=1 go build \
		$(GO_BUILD_FLAGS) \
		-o $(BUILD_DIR)/$(BINARY) \
		$(CMD)
## test: Run all tests (requires local Go toolchain with gcc)
# Tests are compiled with the same build tags as the binary.
test:
	CGO_ENABLED=1 go test \
		-v \
		-tags=$(BUILD_TAGS) \
		./...
## run: Run the pipeline with the default config (binary must be built first)
# The binary is listed as a prerequisite, but no rule in this file produces it
# as a file target; if it is missing, make stops with an error instead of
# building it. $< expands to that prerequisite.
run: $(BUILD_DIR)/$(BINARY)
	./$< -config $(CONFIG)
## clean: Remove build artefacts
# Deletes the whole output directory; nothing outside $(BUILD_DIR) is touched.
clean:
	rm -r -f $(BUILD_DIR)
## help: Show this help message
# Prints every line that starts with "## " from the parsed makefile(s), with
# the marker stripped. A single `sed -n … p` replaces the former `grep | sed`
# pipeline: grep prefixes each match with "file:" as soon as MAKEFILE_LIST
# holds more than one file, which stops the "^## " substitution from matching.
# Additional comments in this file must use a single "#" so they stay out of
# the help output.
help:
	@sed -n 's/^## / /p' $(MAKEFILE_LIST)

212
README.md Normal file
View file

@ -0,0 +1,212 @@
# guenther
A streaming anomaly detection pipeline for Managed-File-Transfer (MFT) infrastructure.
guenther ingests system metrics and application logs in real time, extracts structured
feature vectors per time window, and scores them with an ensemble of unsupervised
detectors — without any labelled training data.
---
## How it works
```
┌─────────────────────────────────────────────────────────────┐
│ Ingestion │
│ MetricCollector (/proc) LogCollector (inotify + Drain3) │
│ SystemctlCollector (service states) │
└────────────────────┬────────────────────────────────────────┘
│ channels (backpressure)
┌────────────────────▼────────────────────────────────────────┐
│ Transformation │
│ TransformEngine 30 s tumbling windows via DuckDB │
│ 45 base features + N Drain3 parameter aggregates │
└────────────────────┬────────────────────────────────────────┘
┌────────────────────▼────────────────────────────────────────┐
│ Detection │
│ EnsembleDetector (RRCF fast/mid/slow · COPOD · MAD) │
│ SEAD online weight adaptation · auto-scaling (3 stages) │
└────────────────────┬────────────────────────────────────────┘
anomalies.jsonl
```
### Packages
| Path | Responsibility |
| -------------------- | -------------------------------------------------------------------------------- |
| `cmd/pipeline` | Entry point, wiring, graceful shutdown |
| `internal/collector` | `MetricCollector` (`/proc`), `LogCollector` (inotify), `SystemctlCollector` |
| `internal/transform` | `TransformEngine` — DuckDB windowed aggregation |
| `internal/detect` | `EnsembleDetector`, RRCF, COPOD, MAD, IsolationForest, SEAD, `ScalingController` |
| `internal/drain3` | Masking / parameter extraction wrapper around Drain3 |
| `internal/config` | YAML config loading and regex compilation |
| `internal/health` | `HealthMonitor` — per-stage counters |
| `pkg/types` | Shared types: `LogEvent`, `MetricSnapshot`, `FeatureVector`, `AnomalyResult` |
---
## Requirements
| Dependency | Notes |
| --------------- | ------------------------------------------------------------ |
| Docker | Required for the containerised build (recommended) |
| Go ≥ 1.25.5 | Only needed for local builds (minimum version set by the `go` directive in `go.mod`) |
| gcc / libc6-dev | CGO is required by `go-duckdb` |
| Linux | Metric collection reads `/proc`; not supported on other OSes |
---
## Building
### Docker (recommended — no local toolchain needed)
```bash
make build
```
The binary is written to `build/guenther`.
### Local (requires Go + gcc)
```bash
make build-local
```
---
## Running
```bash
./build/guenther -config configs/default.yaml
```
guenther shuts down cleanly on `SIGINT` or `SIGTERM`.
---
## Testing
```bash
make test
```
---
## Configuration
guenther is configured via a single YAML file (default: `configs/default.yaml`).
```yaml
ingestion:
log_path: "/path/to/log/file/transfer.log" # file to tail
net_interface: "ens4" # interface for /proc/net/dev
disk_device: "vda1" # device for /proc/diskstats
systemctl_services:
- service1.service
- service2.service
transformation:
window_size: "30s" # tumbling window length
db_path: "data/pipeline.duckdb" # DuckDB file (use :memory: for ephemeral)
drain:
depth: 4
sim_threshold: 0.4
max_children: 100
max_clusters: 1000
masking_patterns: # applied in order before template mining
- name: "uuid"
pattern: '\b[0-9a-fA-F]{8}-...\b'
replace: "<UUID>"
type: "string"
# ... see configs/default.yaml for the full set
detector:
method: "ensemble" # only read when ensemble.enabled = false: copod | rrcf | isolation_forest (any other value selects isolation_forest)
ensemble:
enabled: true
method: "sead" # avg | max | median | sead
contamination: 0.15
sead:
eta: 0.1
lambda: 0.01
auto_scaling:
enabled: true
high_threshold: 75.0 # CPU % → switch to mid detector
critical_threshold: 90.0 # CPU % → switch to fast detector
down_threshold: 50.0
high_duration: 90.0 # seconds load must persist before scaling
critical_duration: 120.0
down_duration: 120.0
rrcf_variants:
fast: { num_trees: 50, tree_size: 32, threshold_percentile: 0.85 }
mid: { num_trees: 150, tree_size: 64, threshold_percentile: 0.85 }
slow: { num_trees: 200, tree_size: 128, threshold_percentile: 0.85 }
copod:
buffer_size: 50
threshold: 0.3
mad:
threshold: 3.5
calibration_size: 50
output:
feature_log_path: "logs/features.jsonl"
anomaly_log_path: "logs/anomalies.jsonl"
```
### Masking pattern types
Patterns with `type: float` extract a named parameter into `FeatureVector.ParamAvg`;
patterns with `type: string` replace the match in-place before template mining.
Named patterns (`name != ""`) are aggregated as features per window.
---
## Output
**`logs/anomalies.jsonl`** — one JSON object per scored window:
```json
{
"timestamp": "2026-01-15T14:32:00Z",
"score": 0.8721,
"is_anomaly": true,
"confidence": 0.91,
"method": "sead_ensemble",
"details": "rrcf_slow=0.91 copod=0.83 mad=0.78"
}
```
**`logs/features.jsonl`** — raw feature vectors for offline analysis (optional).
---
## Project layout
```
guenther/
├── cmd/
│ └── pipeline/
│ └── main.go
├── internal/
│ ├── collector/
│ ├── config/
│ ├── detect/
│ ├── drain3/
│ ├── health/
│ └── transform/
├── pkg/
│ └── types/
├── configs/
│ └── default.yaml
├── build/ # created by `make build`
├── Makefile
└── README.md
```
---
## License
This project was developed as part of a Bachelor's thesis.

294
cmd/pipeline/main.go Normal file
View file

@ -0,0 +1,294 @@
// Command pipeline is the entry point for the MFT anomaly detection pipeline.
//
// Startup order:
// 1. Load and compile config (masking patterns → *regexp.Regexp).
// 2. Allocate channels with fixed capacities to enable backpressure.
// 3. Start HealthMonitor.
// 4. Start collectors (MetricCollector, LogCollector, SystemctlCollector).
// 5. Start TransformEngine (DuckDB, schema, pre-compiled query).
// 6. Start DetectionLayer.
// 7. Start anomaly sink goroutine.
// 8. Wait for SIGINT / SIGTERM.
// 9. Graceful shutdown in reverse order.
package main
import (
	"bufio"
	"context"
	"encoding/json"
	"flag"
	"fmt"
	"log"
	"os"
	"os/signal"
	"path/filepath"
	"sync"
	"syscall"
	"time"

	"codeberg.org/pata1704/guenther/internal/collector"
	"codeberg.org/pata1704/guenther/internal/config"
	"codeberg.org/pata1704/guenther/internal/detect"
	"codeberg.org/pata1704/guenther/internal/health"
	"codeberg.org/pata1704/guenther/internal/transform"
	"codeberg.org/pata1704/guenther/pkg/types"
)
// main wires the pipeline together and blocks until SIGINT or SIGTERM.
//
// It loads and compiles the config, creates the buffered channels that connect
// the stages, starts the health monitor, the collectors, the transform engine
// and the detection layer, and runs a sink goroutine that writes every result
// to the anomaly log. Once the signal arrives it waits for the stages and
// closes the downstream channels in producer-to-consumer order.
//
// Any startup failure ends the process through log.Fatalf, which exits without
// running deferred calls.
func main() {
	cfgPath := flag.String("config", "configs/default.yaml", "path to config file")
	flag.Parse()

	cfg, err := config.LoadConfig(*cfgPath)
	if err != nil {
		log.Fatalf("load config %q: %v", *cfgPath, err)
	}
	if err := cfg.Compile(); err != nil {
		log.Fatalf("compile masking patterns: %v", err)
	}

	// ctx is cancelled on the first SIGINT or SIGTERM; every stage below
	// receives it.
	ctx, cancel := signal.NotifyContext(context.Background(), syscall.SIGINT, syscall.SIGTERM)
	defer cancel()

	// Inter-stage channels with fixed buffer capacities.
	logChan := make(chan types.LogEvent, 1_000)
	metricChan := make(chan types.MetricSnapshot, 100)
	serviceStatusChan := make(chan types.ServiceStatus, 100)
	featureChan := make(chan types.FeatureVector, 10)
	anomalyChan := make(chan types.AnomalyResult, 50)

	// The health monitor is started first; each stage is handed hm.Chan().
	// The 5 s argument is presumably its reporting interval — TODO confirm.
	hm := health.NewHealthMonitor()
	hm.Start(ctx, 5*time.Second)

	// Collectors. The time.Second / 5*time.Second arguments are presumably
	// polling intervals — TODO confirm against the collector package.
	metricColl := collector.NewMetricCollector(
		metricChan, hm.Chan(),
		time.Second,
		cfg.Ingestion.NetInterface,
		cfg.Ingestion.DiskDevice,
	)
	logColl := collector.NewLogCollector(cfg, logChan, hm.Chan())
	sysColl := collector.NewSystemctlCollector(
		cfg.Ingestion.SystemctlServices,
		5*time.Second,
		serviceStatusChan,
		hm.Chan(),
	)
	metricColl.Start(ctx)
	// Only the log collector reports a start-up error.
	if err := logColl.Start(ctx); err != nil {
		log.Fatalf("start log collector: %v", err)
	}
	sysColl.Start(ctx)

	// The transform engine consumes the three collector channels and produces
	// on featureChan.
	engine, err := transform.NewTransformEngine(cfg, logChan, metricChan, serviceStatusChan, featureChan, hm.Chan())
	if err != nil {
		log.Fatalf("create transform engine: %v", err)
	}
	engine.Start(ctx)

	detector, err := buildDetector(cfg)
	if err != nil {
		log.Fatalf("build detector: %v", err)
	}
	detLayer := detect.NewDetectionLayer(detector, featureChan, anomalyChan, hm.Chan())

	// Auto-scaling is attached only when the detector is a
	// *detect.SwitchableDetector, which buildDetector returns for the SEAD
	// ensemble; otherwise a warning is logged and detection runs unscaled.
	if cfg.Detection.AutoScaling.Enabled {
		if sd, ok := detector.(*detect.SwitchableDetector); ok {
			sc := detect.NewScalingController(
				sd,
				cfg.Detection.AutoScaling.HighThreshold,
				cfg.Detection.AutoScaling.CritThreshold,
				cfg.Detection.AutoScaling.DownThreshold,
				cfg.Detection.AutoScaling.HighDuration,
				cfg.Detection.AutoScaling.CritDuration,
				cfg.Detection.AutoScaling.DownDuration,
			)
			detLayer.SetScalingController(sc)
			log.Println("detector: auto-scaling enabled")
		} else {
			log.Println("warning: auto-scaling requested but detector is not switchable (requires SEAD ensemble)")
		}
	}
	detLayer.Start(ctx)

	// openLog returns nil when the path is empty or the file cannot be opened;
	// maybeWriter and writeJSON both tolerate that nil, so the sink then only
	// logs anomalies to the process log.
	anomalyLog := openLog(cfg.Output.AnomalyLogPath, "anomaly log")
	if anomalyLog != nil {
		defer anomalyLog.Close()
	}
	anomalyWriter := maybeWriter(anomalyLog)

	// Sink goroutine: writes every result as a JSON line and additionally logs
	// the ones flagged as anomalies. It ends when anomalyChan is closed during
	// shutdown.
	var sinkWg sync.WaitGroup
	sinkWg.Add(1)
	go func() {
		defer sinkWg.Done()
		for res := range anomalyChan {
			writeJSON(anomalyWriter, res)
			if res.IsAnomaly {
				log.Printf("[ANOMALY] time=%s score=%.4f method=%s details=%s",
					res.Timestamp.Format(time.RFC3339), res.Score, res.Method, res.Details)
			}
		}
	}()

	// Optionally log SEAD weights every 60 s.
	// NOTE(review): for the SEAD method buildDetector returns the result of
	// detect.NewSwitchableDetector rather than the *detect.EnsembleDetector,
	// so this type assertion looks like it never succeeds in exactly the case
	// it is meant for — confirm.
	if ens, ok := detector.(*detect.EnsembleDetector); ok {
		go func() {
			t := time.NewTicker(60 * time.Second)
			defer t.Stop()
			for {
				select {
				case <-ctx.Done():
					return
				case <-t.C:
					if ws := ens.WeightSummary(); ws != "" {
						log.Printf("[SEAD weights] %s", ws)
					}
				}
			}
		}()
	}

	log.Println("pipeline started waiting for SIGINT / SIGTERM")
	<-ctx.Done()
	log.Println("shutting down…")

	// Drain from producers to consumers: a channel is closed only after the
	// stage that writes to it has finished, which lets the reader's range loop
	// terminate.
	// NOTE(review): sysColl is started above but never waited on here —
	// confirm whether that is intentional.
	metricColl.Wait()
	logColl.Wait()
	engine.Wait()
	close(featureChan)
	detLayer.Wait()
	close(anomalyChan)
	sinkWg.Wait()
	hm.Wait()
	log.Println("pipeline stopped")
}
// buildDetector constructs the configured AnomalyDetector.
//
// Routing:
//  1. detector.ensemble.enabled = true → EnsembleDetector with the method
//     specified by detector.ensemble.method ("avg"|"max"|"median"|"sead");
//     an empty method defaults to "avg". For "sead" the ensemble's SEAD
//     detector is wrapped in a SwitchableDetector so that auto-scaling can
//     be attached by the caller.
//  2. Otherwise fall through to detector.method ("copod"|"rrcf"|"isolation_forest").
//     An empty value selects the isolation forest. Any other unrecognised
//     value (for example "ensemble" while the ensemble is disabled) also
//     selects the isolation forest, but a warning is logged so that a typo
//     in the config does not go unnoticed.
//
// The returned error comes from the detector constructors; the ensemble error
// is wrapped with the ensemble method name.
func buildDetector(cfg *config.Config) (detect.AnomalyDetector, error) {
	if cfg.Detection.Ensemble.Enabled {
		method := detect.EnsembleMethod(cfg.Detection.Ensemble.Method)
		if method == "" {
			method = detect.EnsembleAVG // backward-compat default
		}

		// Map SEAD config from YAML to detect.SEADConfig.
		seadCfg := detect.SEADConfig{
			Eta:            cfg.Detection.Ensemble.SEAD.Eta,
			Lambda:         cfg.Detection.Ensemble.SEAD.Lambda,
			QuantileWindow: cfg.Detection.Ensemble.SEAD.QuantileWindow,
			MinDataPoints:  cfg.Detection.Ensemble.SEAD.MinDataPoints,
			Contamination:  cfg.Detection.Ensemble.Contamination,
		}
		// Apply defaults for zero-value fields. Lambda is deliberately left
		// as configured: no default is applied to it here.
		if seadCfg.Eta == 0 {
			seadCfg.Eta = 0.10
		}
		if seadCfg.QuantileWindow == 0 {
			seadCfg.QuantileWindow = 300
		}
		if seadCfg.MinDataPoints == 0 {
			seadCfg.MinDataPoints = 20
		}

		det, err := detect.NewEnsembleDetector(
			method,
			cfg.Detection.COPOD.BufferSize,
			cfg.Detection.COPOD.Threshold,
			detect.RRCFVariantsConfig{
				Fast: detect.RRCFVariantConfig{
					NumTrees:            cfg.Detection.RRCFVariants.Fast.NumTrees,
					TreeSize:            cfg.Detection.RRCFVariants.Fast.TreeSize,
					ThresholdPercentile: cfg.Detection.RRCFVariants.Fast.ThresholdPercentile,
				},
				Mid: detect.RRCFVariantConfig{
					NumTrees:            cfg.Detection.RRCFVariants.Mid.NumTrees,
					TreeSize:            cfg.Detection.RRCFVariants.Mid.TreeSize,
					ThresholdPercentile: cfg.Detection.RRCFVariants.Mid.ThresholdPercentile,
				},
				Slow: detect.RRCFVariantConfig{
					NumTrees:            cfg.Detection.RRCFVariants.Slow.NumTrees,
					TreeSize:            cfg.Detection.RRCFVariants.Slow.TreeSize,
					ThresholdPercentile: cfg.Detection.RRCFVariants.Slow.ThresholdPercentile,
				},
			},
			cfg.Detection.Ensemble.Contamination,
			seadCfg,
		)
		if err != nil {
			return nil, fmt.Errorf("build ensemble detector (%s): %w", method, err)
		}
		log.Printf("detector: Ensemble method=%s contamination=%.2f", method, cfg.Detection.Ensemble.Contamination)

		if method == detect.EnsembleSEAD {
			log.Printf("detector: SEAD η=%.3f λ=%.3f quantile_window=%d",
				seadCfg.Eta, seadCfg.Lambda, seadCfg.QuantileWindow)
			// Wrap in SwitchableDetector if using SEAD (required for 3-stage scaling).
			// If the ensemble exposes no SEAD detector, the plain ensemble is
			// returned below instead.
			if sead := det.SEAD(); sead != nil {
				return detect.NewSwitchableDetector(sead), nil
			}
		}
		return det, nil
	}

	switch cfg.Detection.Method {
	case "copod":
		return detect.NewCOPODDetector(
			cfg.Detection.COPOD.BufferSize,
			cfg.Detection.COPOD.Threshold,
		)
	case "rrcf":
		return detect.NewRRCFDetector(
			cfg.Detection.RRCF.NumTrees,
			cfg.Detection.RRCF.TreeSize,
			0,
			cfg.Detection.RRCF.ThresholdPercentile,
		), nil
	case "isolation_forest", "":
		// Explicitly requested, or nothing configured: use the default below.
	default:
		// Keep the historical fallback, but make it visible.
		log.Printf("warning: unknown detector.method %q, falling back to isolation_forest",
			cfg.Detection.Method)
	}

	// Isolation forest with fixed parameters; their meaning is defined by
	// detect.NewIsolationForestDetector (not visible from this file).
	return detect.NewIsolationForestDetector(
		5_000, 100, 100, 256, 0.05, 10.0,
	), nil
}
func openLog(path, label string) *os.File {
if path == "" {
return nil
}
f, err := os.OpenFile(path, os.O_APPEND|os.O_CREATE|os.O_WRONLY, 0o644)
if err != nil {
log.Printf("warning: cannot open %s %q: %v", label, path, err)
return nil
}
return f
}
func maybeWriter(f *os.File) *bufio.Writer {
if f == nil {
return nil
}
return bufio.NewWriterSize(f, 64*1024)
}
func writeJSON(w *bufio.Writer, v any) {
if w == nil {
return
}
b, err := json.Marshal(v)
if err != nil {
log.Printf("marshal: %v", err)
return
}
if _, err := w.Write(append(b, '\n')); err != nil {
log.Printf("write log: %v", err)
return
}
if err := w.Flush(); err != nil {
log.Printf("flush log: %v", err)
}
}

123
configs/default.yaml Normal file
View file

@ -0,0 +1,123 @@
ingestion:
log_path: "/path/to/log/file/transfer.log"
net_interface: "ens4"
disk_device: "vda1"
systemctl_services:
- service1.service
- service2.service
transformation:
window_size: "30s"
db_path: "data/pipeline_test.duckdb"
drain:
depth: 4
sim_threshold: 0.4
max_children: 100
max_clusters: 1000
masking_patterns:
- name: "loglevel"
pattern: '^(\S+)'
replace: "<LOGLEVEL>"
type: "string"
- name: ""
pattern: '(\d{4}-\d{2}-\d{2})'
replace: "<DATE>"
type: "string"
- name: ""
pattern: '(\d{2}:\d{2}:\d{2}\.\d{6})'
replace: "<TIME>"
type: "string"
- name: "uuid"
pattern: '\b[0-9a-fA-F]{8}-[0-9a-fA-F]{4}-[0-9a-fA-F]{4}-[0-9a-fA-F]{4}-[0-9a-fA-F]{12}\b'
replace: "<UUID>"
type: "string"
- name: ""
pattern: '\+\]'
replace: "<SESSION>"
type: "string"
- name: ""
pattern: "(/[a-zA-Z0-9._-]+)+"
replace: "<PATH>"
type: "string"
- name: ""
pattern: '(sync-file-reader|checksum|xp-network-(?:sender|receiver)|aes-crypt)-\d+:'
replace: "<MODULE>:"
type: "string"
- name: "datarate"
pattern: 'datarate=\s*(\d+(?:\.\d+)?)'
replace: "<datarate>"
type: "float"
- name: "duration"
pattern: 'duration=\s*(\d+(?:\.\d+)?)'
replace: "<duration>"
type: "float"
- name: "throughput"
pattern: 'throughput=\s*(\d+(?:\.\d+)?)'
replace: "<throughput>"
type: "float"
- name: "filesize"
pattern: '(\d+(?:\.\d+)?)\s*(?:MByte|GByte|MiB|GiB|GB|MB|KB)'
replace: "<filesize>"
type: "float"
- name: "hostport"
pattern: '([a-zA-Z0-9.-]+:\d+)'
replace: "<HOSTPORT>"
type: "string"
- name: ""
pattern: '\b(\d+(?:\.\d+)?)\b'
replace: "<NUM>"
type: "float"
detector:
method: "ensemble"
ensemble:
enabled: true
method: "sead"
contamination: 0.15
sead:
eta: 0.1
lambda: 0.01
auto_scaling:
enabled: true
high_threshold: 75.0
critical_threshold: 90.0
high_duration: 90.0
critical_duration: 120.0
down_threshold: 50.0
down_duration: 120.0
rrcf_variants:
fast:
num_trees: 50
tree_size: 32
threshold_percentile: 0.85
mid:
num_trees: 150
tree_size: 64
threshold_percentile: 0.85
slow:
num_trees: 200
tree_size: 128
threshold_percentile: 0.85
copod:
buffer_size: 50
threshold: 0.3
mad:
threshold: 3.5
calibration_size: 50
output:
feature_log_path: "logs/features.jsonl"
anomaly_log_path: "logs/anomalies.jsonl"

49
go.mod Normal file
View file

@ -0,0 +1,49 @@
module codeberg.org/pata1704/guenther
go 1.25.5
require (
codeberg.org/pata1704/copod v0.0.0-20260308082005-aded842ae0c1
codeberg.org/pata1704/drain3 v1.0.0
codeberg.org/pata1704/rrcf v0.0.0-20260305123746-25e149fa69ba
github.com/apache/arrow-go/v18 v18.5.1
github.com/duckdb/duckdb-go/v2 v2.5.5
github.com/e-XpertSolutions/go-iforest v1.0.0
github.com/fsnotify/fsnotify v1.9.0
github.com/stretchr/testify v1.11.1
gopkg.in/yaml.v3 v3.0.1
)
require (
github.com/davecgh/go-spew v1.1.2-0.20180830191138-d8f796af33cc // indirect
github.com/duckdb/duckdb-go-bindings v0.3.3 // indirect
github.com/duckdb/duckdb-go-bindings/lib/darwin-amd64 v0.3.3 // indirect
github.com/duckdb/duckdb-go-bindings/lib/darwin-arm64 v0.3.3 // indirect
github.com/duckdb/duckdb-go-bindings/lib/linux-amd64 v0.3.3 // indirect
github.com/duckdb/duckdb-go-bindings/lib/linux-arm64 v0.3.3 // indirect
github.com/duckdb/duckdb-go-bindings/lib/windows-amd64 v0.3.3 // indirect
github.com/dustin/go-humanize v1.0.1 // indirect
github.com/go-viper/mapstructure/v2 v2.5.0 // indirect
github.com/goccy/go-json v0.10.5 // indirect
github.com/google/flatbuffers v25.12.19+incompatible // indirect
github.com/google/uuid v1.6.0 // indirect
github.com/klauspost/compress v1.18.3 // indirect
github.com/klauspost/cpuid/v2 v2.3.0 // indirect
github.com/mattn/go-isatty v0.0.20 // indirect
github.com/ncruces/go-strftime v1.0.0 // indirect
github.com/pierrec/lz4/v4 v4.1.25 // indirect
github.com/pmezard/go-difflib v1.0.1-0.20181226105442-5d4384ee4fb2 // indirect
github.com/remyoudompheng/bigfft v0.0.0-20230129092748-24d4a6f8daec // indirect
github.com/zeebo/xxh3 v1.1.0 // indirect
golang.org/x/exp v0.0.0-20260112195511-716be5621a96 // indirect
golang.org/x/mod v0.32.0 // indirect
golang.org/x/sync v0.19.0 // indirect
golang.org/x/sys v0.40.0 // indirect
golang.org/x/telemetry v0.0.0-20260116145544-c6413dc483f5 // indirect
golang.org/x/tools v0.41.0 // indirect
golang.org/x/xerrors v0.0.0-20240903120638-7835f813f4da // indirect
modernc.org/libc v1.67.6 // indirect
modernc.org/mathutil v1.7.1 // indirect
modernc.org/memory v1.11.0 // indirect
modernc.org/sqlite v1.44.1 // indirect
)

125
go.sum Normal file
View file

@ -0,0 +1,125 @@
codeberg.org/pata1704/copod v0.0.0-20260308082005-aded842ae0c1 h1:DoXV7m58nWibyIvVaUj4AVyVM/FN1SSpHuiuae+2Pa0=
codeberg.org/pata1704/copod v0.0.0-20260308082005-aded842ae0c1/go.mod h1:IchgVmiksba/DP7BjHiAYKoSrKTe3zrNrFO9QZWNxx0=
codeberg.org/pata1704/drain3 v1.0.0 h1:X66fn+lnzOMU+PFFSkNBF89z1ghbqihE1I4A6x/OJIM=
codeberg.org/pata1704/drain3 v1.0.0/go.mod h1:+K1hIYh3hNSPiXRxUin6ZiC2CC9FDGqQKNNR+7ZIx9s=
codeberg.org/pata1704/rrcf v0.0.0-20260305123746-25e149fa69ba h1:szOyiRopNELsHg9v/Tvif2292MGpgz+Hw9QqTMgildg=
codeberg.org/pata1704/rrcf v0.0.0-20260305123746-25e149fa69ba/go.mod h1:BmI1vkwcwL5tlRVfn3wEDZV+MXQbPMj8w7IsUhelrkA=
github.com/andybalholm/brotli v1.2.0 h1:ukwgCxwYrmACq68yiUqwIWnGY0cTPox/M94sVwToPjQ=
github.com/andybalholm/brotli v1.2.0/go.mod h1:rzTDkvFWvIrjDXZHkuS16NPggd91W3kUSvPlQ1pLaKY=
github.com/apache/arrow-go/v18 v18.5.1 h1:yaQ6zxMGgf9YCYw4/oaeOU3AULySDlAYDOcnr4LdHdI=
github.com/apache/arrow-go/v18 v18.5.1/go.mod h1:OCCJsmdq8AsRm8FkBSSmYTwL/s4zHW9CqxeBxEytkNE=
github.com/apache/thrift v0.22.0 h1:r7mTJdj51TMDe6RtcmNdQxgn9XcyfGDOzegMDRg47uc=
github.com/apache/thrift v0.22.0/go.mod h1:1e7J/O1Ae6ZQMTYdy9xa3w9k+XHWPfRvdPyJeynQ+/g=
github.com/davecgh/go-spew v1.1.2-0.20180830191138-d8f796af33cc h1:U9qPSI2PIWSS1VwoXQT9A3Wy9MM3WgvqSxFWenqJduM=
github.com/davecgh/go-spew v1.1.2-0.20180830191138-d8f796af33cc/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
github.com/duckdb/duckdb-go-bindings v0.3.3 h1:lXogtCY8hiGLQvTfK55HcgvaA3K2MrwKeZGqhIin35U=
github.com/duckdb/duckdb-go-bindings v0.3.3/go.mod h1:zS7OpBP8zwVlP38OljRZOnqWYlNd4KLcVfMoA1JFzpk=
github.com/duckdb/duckdb-go-bindings/lib/darwin-amd64 v0.3.3 h1:ue8BtIOSt+2Bt2fEfTAvBcQLxzBFhgfCcyzPtqQWTRA=
github.com/duckdb/duckdb-go-bindings/lib/darwin-amd64 v0.3.3/go.mod h1:EnAvZh1kNJHp5yF+M1ZHNEvapnmt6anq1xXHVrAGqMo=
github.com/duckdb/duckdb-go-bindings/lib/darwin-arm64 v0.3.3 h1:2TrSeTgtwi3WIvub9ba0mny+AClSNo1w0Ghszc2B8lQ=
github.com/duckdb/duckdb-go-bindings/lib/darwin-arm64 v0.3.3/go.mod h1:IGLSeEcFhNeZF16aVjQCULD7TsFZKG5G7SyKJAXKp5c=
github.com/duckdb/duckdb-go-bindings/lib/linux-amd64 v0.3.3 h1:GN0cexhfE7uLb7qgDmsYG324wKF15nW+O7v5+NGalS4=
github.com/duckdb/duckdb-go-bindings/lib/linux-amd64 v0.3.3/go.mod h1:KAIynZ0GHCS7X5fRyuFnQMg/SZBPK/bS9OCOVojClxw=
github.com/duckdb/duckdb-go-bindings/lib/linux-arm64 v0.3.3 h1:bIJV+ct6yvMXjy+N3bfILFd0fkTK50AUhUTerkY40/8=
github.com/duckdb/duckdb-go-bindings/lib/linux-arm64 v0.3.3/go.mod h1:81SGOYoEUs8qaAfSk1wRfM5oobrIJ5KI7AzYhK6/bvQ=
github.com/duckdb/duckdb-go-bindings/lib/windows-amd64 v0.3.3 h1:SK2sunA/MPb2T3113iFzHv6DWeu+qrsw0DizTFrvM+Q=
github.com/duckdb/duckdb-go-bindings/lib/windows-amd64 v0.3.3/go.mod h1:K25pJL26ARblGDeuAkrdblFvUen92+CwksLtPEHRqqQ=
github.com/duckdb/duckdb-go/v2 v2.5.5 h1:TlK8ipnzoKW2aNrjGqRkFWLCDpJDxR/VwH8ezEcvVhw=
github.com/duckdb/duckdb-go/v2 v2.5.5/go.mod h1:6uIbC3gz36NCEygECzboygOo/Z9TeVwox/puG+ohWV0=
github.com/dustin/go-humanize v1.0.1 h1:GzkhY7T5VNhEkwH0PVJgjz+fX1rhBrR7pRT3mDkpeCY=
github.com/dustin/go-humanize v1.0.1/go.mod h1:Mu1zIs6XwVuF/gI1OepvI0qD18qycQx+mFykh5fBlto=
github.com/e-XpertSolutions/go-iforest v1.0.0 h1:x8IN5xsmugc9VsVyHlBtR7EY9tEacBX7A5dwXXh1y94=
github.com/e-XpertSolutions/go-iforest v1.0.0/go.mod h1:t3C4RgLJcVtm2sOOXB+UTbwGiT+TPQAeP9daEWR4C8c=
github.com/fsnotify/fsnotify v1.9.0 h1:2Ml+OJNzbYCTzsxtv8vKSFD9PbJjmhYF14k/jKC7S9k=
github.com/fsnotify/fsnotify v1.9.0/go.mod h1:8jBTzvmWwFyi3Pb8djgCCO5IBqzKJ/Jwo8TRcHyHii0=
github.com/go-viper/mapstructure/v2 v2.5.0 h1:vM5IJoUAy3d7zRSVtIwQgBj7BiWtMPfmPEgAXnvj1Ro=
github.com/go-viper/mapstructure/v2 v2.5.0/go.mod h1:oJDH3BJKyqBA2TXFhDsKDGDTlndYOZ6rGS0BRZIxGhM=
github.com/goccy/go-json v0.10.5 h1:Fq85nIqj+gXn/S5ahsiTlK3TmC85qgirsdTP/+DeaC4=
github.com/goccy/go-json v0.10.5/go.mod h1:oq7eo15ShAhp70Anwd5lgX2pLfOS3QCiwU/PULtXL6M=
github.com/golang/snappy v1.0.0 h1:Oy607GVXHs7RtbggtPBnr2RmDArIsAefDwvrdWvRhGs=
github.com/golang/snappy v1.0.0/go.mod h1:/XxbfmMg8lxefKM7IXC3fBNl/7bRcc72aCRzEWrmP2Q=
github.com/google/flatbuffers v25.12.19+incompatible h1:haMV2JRRJCe1998HeW/p0X9UaMTK6SDo0ffLn2+DbLs=
github.com/google/flatbuffers v25.12.19+incompatible/go.mod h1:1AeVuKshWv4vARoZatz6mlQ0JxURH0Kv5+zNeJKJCa8=
github.com/google/go-cmp v0.6.0 h1:ofyhxvXcZhMsU5ulbFiLKl/XBFqE1GSq7atu8tAmTRI=
github.com/google/go-cmp v0.6.0/go.mod h1:17dUlkBOakJ0+DkrSSNjCkIjxS6bF9zb3elmeNGIjoY=
github.com/google/pprof v0.0.0-20250317173921-a4b03ec1a45e h1:ijClszYn+mADRFY17kjQEVQ1XRhq2/JR1M3sGqeJoxs=
github.com/google/pprof v0.0.0-20250317173921-a4b03ec1a45e/go.mod h1:boTsfXsheKC2y+lKOCMpSfarhxDeIzfZG1jqGcPl3cA=
github.com/google/uuid v1.6.0 h1:NIvaJDMOsjHA8n1jAhLSgzrAzy1Hgr+hNrb57e+94F0=
github.com/google/uuid v1.6.0/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo=
github.com/hashicorp/golang-lru/v2 v2.0.7 h1:a+bsQ5rvGLjzHuww6tVxozPZFVghXaHOwFs4luLUK2k=
github.com/hashicorp/golang-lru/v2 v2.0.7/go.mod h1:QeFd9opnmA6QUJc5vARoKUSoFhyfM2/ZepoAG6RGpeM=
github.com/klauspost/asmfmt v1.3.2 h1:4Ri7ox3EwapiOjCki+hw14RyKk201CN4rzyCJRFLpK4=
github.com/klauspost/asmfmt v1.3.2/go.mod h1:AG8TuvYojzulgDAMCnYn50l/5QV3Bs/tp6j0HLHbNSE=
github.com/klauspost/compress v1.18.3 h1:9PJRvfbmTabkOX8moIpXPbMMbYN60bWImDDU7L+/6zw=
github.com/klauspost/compress v1.18.3/go.mod h1:R0h/fSBs8DE4ENlcrlib3PsXS61voFxhIs2DeRhCvJ4=
github.com/klauspost/cpuid/v2 v2.3.0 h1:S4CRMLnYUhGeDFDqkGriYKdfoFlDnMtqTiI/sFzhA9Y=
github.com/klauspost/cpuid/v2 v2.3.0/go.mod h1:hqwkgyIinND0mEev00jJYCxPNVRVXFQeu1XKlok6oO0=
github.com/mattn/go-isatty v0.0.20 h1:xfD0iDuEKnDkl03q4limB+vH+GxLEtL/jb4xVJSWWEY=
github.com/mattn/go-isatty v0.0.20/go.mod h1:W+V8PltTTMOvKvAeJH7IuucS94S2C6jfK/D7dTCTo3Y=
github.com/minio/asm2plan9s v0.0.0-20200509001527-cdd76441f9d8 h1:AMFGa4R4MiIpspGNG7Z948v4n35fFGB3RR3G/ry4FWs=
github.com/minio/asm2plan9s v0.0.0-20200509001527-cdd76441f9d8/go.mod h1:mC1jAcsrzbxHt8iiaC+zU4b1ylILSosueou12R++wfY=
github.com/minio/c2goasm v0.0.0-20190812172519-36a3d3bbc4f3 h1:+n/aFZefKZp7spd8DFdX7uMikMLXX4oubIzJF4kv/wI=
github.com/minio/c2goasm v0.0.0-20190812172519-36a3d3bbc4f3/go.mod h1:RagcQ7I8IeTMnF8JTXieKnO4Z6JCsikNEzj0DwauVzE=
github.com/ncruces/go-strftime v1.0.0 h1:HMFp8mLCTPp341M/ZnA4qaf7ZlsbTc+miZjCLOFAw7w=
github.com/ncruces/go-strftime v1.0.0/go.mod h1:Fwc5htZGVVkseilnfgOVb9mKy6w1naJmn9CehxcKcls=
github.com/pierrec/lz4/v4 v4.1.25 h1:kocOqRffaIbU5djlIBr7Wh+cx82C0vtFb0fOurZHqD0=
github.com/pierrec/lz4/v4 v4.1.25/go.mod h1:EoQMVJgeeEOMsCqCzqFm2O0cJvljX2nGZjcRIPL34O4=
github.com/pmezard/go-difflib v1.0.1-0.20181226105442-5d4384ee4fb2 h1:Jamvg5psRIccs7FGNTlIRMkT8wgtp5eCXdBlqhYGL6U=
github.com/pmezard/go-difflib v1.0.1-0.20181226105442-5d4384ee4fb2/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4=
github.com/remyoudompheng/bigfft v0.0.0-20230129092748-24d4a6f8daec h1:W09IVJc94icq4NjY3clb7Lk8O1qJ8BdBEF8z0ibU0rE=
github.com/remyoudompheng/bigfft v0.0.0-20230129092748-24d4a6f8daec/go.mod h1:qqbHyh8v60DhA7CoWK5oRCqLrMHRGoxYCSS9EjAz6Eo=
github.com/stretchr/testify v1.11.1 h1:7s2iGBzp5EwR7/aIZr8ao5+dra3wiQyKjjFuvgVKu7U=
github.com/stretchr/testify v1.11.1/go.mod h1:wZwfW3scLgRK+23gO65QZefKpKQRnfz6sD981Nm4B6U=
github.com/zeebo/assert v1.3.0 h1:g7C04CbJuIDKNPFHmsk4hwZDO5O+kntRxzaUoNXj+IQ=
github.com/zeebo/assert v1.3.0/go.mod h1:Pq9JiuJQpG8JLJdtkwrJESF0Foym2/D9XMU5ciN/wJ0=
github.com/zeebo/xxh3 v1.1.0 h1:s7DLGDK45Dyfg7++yxI0khrfwq9661w9EN78eP/UZVs=
github.com/zeebo/xxh3 v1.1.0/go.mod h1:IisAie1LELR4xhVinxWS5+zf1lA4p0MW4T+w+W07F5s=
golang.org/x/exp v0.0.0-20260112195511-716be5621a96 h1:Z/6YuSHTLOHfNFdb8zVZomZr7cqNgTJvA8+Qz75D8gU=
golang.org/x/exp v0.0.0-20260112195511-716be5621a96/go.mod h1:nzimsREAkjBCIEFtHiYkrJyT+2uy9YZJB7H1k68CXZU=
golang.org/x/mod v0.32.0 h1:9F4d3PHLljb6x//jOyokMv3eX+YDeepZSEo3mFJy93c=
golang.org/x/mod v0.32.0/go.mod h1:SgipZ/3h2Ci89DlEtEXWUk/HteuRin+HHhN+WbNhguU=
golang.org/x/sync v0.19.0 h1:vV+1eWNmZ5geRlYjzm2adRgW2/mcpevXNg50YZtPCE4=
golang.org/x/sync v0.19.0/go.mod h1:9KTHXmSnoGruLpwFjVSX0lNNA75CykiMECbovNTZqGI=
golang.org/x/sys v0.6.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
golang.org/x/sys v0.40.0 h1:DBZZqJ2Rkml6QMQsZywtnjnnGvHza6BTfYFWY9kjEWQ=
golang.org/x/sys v0.40.0/go.mod h1:OgkHotnGiDImocRcuBABYBEXf8A9a87e/uXjp9XT3ks=
golang.org/x/telemetry v0.0.0-20260116145544-c6413dc483f5 h1:i0p03B68+xC1kD2QUO8JzDTPXCzhN56OLJ+IhHY8U3A=
golang.org/x/telemetry v0.0.0-20260116145544-c6413dc483f5/go.mod h1:b7fPSJ0pKZ3ccUh8gnTONJxhn3c/PS6tyzQvyqw4iA8=
golang.org/x/tools v0.41.0 h1:a9b8iMweWG+S0OBnlU36rzLp20z1Rp10w+IY2czHTQc=
golang.org/x/tools v0.41.0/go.mod h1:XSY6eDqxVNiYgezAVqqCeihT4j1U2CCsqvH3WhQpnlg=
golang.org/x/xerrors v0.0.0-20240903120638-7835f813f4da h1:noIWHXmPHxILtqtCOPIhSt0ABwskkZKjD3bXGnZGpNY=
golang.org/x/xerrors v0.0.0-20240903120638-7835f813f4da/go.mod h1:NDW/Ps6MPRej6fsCIbMTohpP40sJ/P/vI1MoTEGwX90=
gonum.org/v1/gonum v0.16.0 h1:5+ul4Swaf3ESvrOnidPp4GZbzf0mxVQpDCYUQE7OJfk=
gonum.org/v1/gonum v0.16.0/go.mod h1:fef3am4MQ93R2HHpKnLk4/Tbh/s0+wqD5nfa6Pnwy4E=
gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405 h1:yhCVgyC4o1eVCa2tZl7eS0r+SDo693bJlVdllGtEeKM=
gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0=
gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA=
gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM=
modernc.org/cc/v4 v4.27.1 h1:9W30zRlYrefrDV2JE2O8VDtJ1yPGownxciz5rrbQZis=
modernc.org/cc/v4 v4.27.1/go.mod h1:uVtb5OGqUKpoLWhqwNQo/8LwvoiEBLvZXIQ/SmO6mL0=
modernc.org/ccgo/v4 v4.30.1 h1:4r4U1J6Fhj98NKfSjnPUN7Ze2c6MnAdL0hWw6+LrJpc=
modernc.org/ccgo/v4 v4.30.1/go.mod h1:bIOeI1JL54Utlxn+LwrFyjCx2n2RDiYEaJVSrgdrRfM=
modernc.org/fileutil v1.3.40 h1:ZGMswMNc9JOCrcrakF1HrvmergNLAmxOPjizirpfqBA=
modernc.org/fileutil v1.3.40/go.mod h1:HxmghZSZVAz/LXcMNwZPA/DRrQZEVP9VX0V4LQGQFOc=
modernc.org/gc/v2 v2.6.5 h1:nyqdV8q46KvTpZlsw66kWqwXRHdjIlJOhG6kxiV/9xI=
modernc.org/gc/v2 v2.6.5/go.mod h1:YgIahr1ypgfe7chRuJi2gD7DBQiKSLMPgBQe9oIiito=
modernc.org/gc/v3 v3.1.1 h1:k8T3gkXWY9sEiytKhcgyiZ2L0DTyCQ/nvX+LoCljoRE=
modernc.org/gc/v3 v3.1.1/go.mod h1:HFK/6AGESC7Ex+EZJhJ2Gni6cTaYpSMmU/cT9RmlfYY=
modernc.org/goabi0 v0.2.0 h1:HvEowk7LxcPd0eq6mVOAEMai46V+i7Jrj13t4AzuNks=
modernc.org/goabi0 v0.2.0/go.mod h1:CEFRnnJhKvWT1c1JTI3Avm+tgOWbkOu5oPA8eH8LnMI=
modernc.org/libc v1.67.6 h1:eVOQvpModVLKOdT+LvBPjdQqfrZq+pC39BygcT+E7OI=
modernc.org/libc v1.67.6/go.mod h1:JAhxUVlolfYDErnwiqaLvUqc8nfb2r6S6slAgZOnaiE=
modernc.org/mathutil v1.7.1 h1:GCZVGXdaN8gTqB1Mf/usp1Y/hSqgI2vAGGP4jZMCxOU=
modernc.org/mathutil v1.7.1/go.mod h1:4p5IwJITfppl0G4sUEDtCr4DthTaT47/N3aT6MhfgJg=
modernc.org/memory v1.11.0 h1:o4QC8aMQzmcwCK3t3Ux/ZHmwFPzE6hf2Y5LbkRs+hbI=
modernc.org/memory v1.11.0/go.mod h1:/JP4VbVC+K5sU2wZi9bHoq2MAkCnrt2r98UGeSK7Mjw=
modernc.org/opt v0.1.4 h1:2kNGMRiUjrp4LcaPuLY2PzUfqM/w9N23quVwhKt5Qm8=
modernc.org/opt v0.1.4/go.mod h1:03fq9lsNfvkYSfxrfUhZCWPk1lm4cq4N+Bh//bEtgns=
modernc.org/sortutil v1.2.1 h1:+xyoGf15mM3NMlPDnFqrteY07klSFxLElE2PVuWIJ7w=
modernc.org/sortutil v1.2.1/go.mod h1:7ZI3a3REbai7gzCLcotuw9AC4VZVpYMjDzETGsSMqJE=
modernc.org/sqlite v1.44.1 h1:qybx/rNpfQipX/t47OxbHmkkJuv2JWifCMH8SVUiDas=
modernc.org/sqlite v1.44.1/go.mod h1:CzbrU2lSB1DKUusvwGz7rqEKIq+NUd8GWuBBZDs9/nA=
modernc.org/strutil v1.2.1 h1:UneZBkQA+DX2Rp35KcM69cSsNES9ly8mQWD71HKlOA0=
modernc.org/strutil v1.2.1/go.mod h1:EHkiggD70koQxjVdSBM3JKM7k6L0FbGE5eymy9i3B9A=
modernc.org/token v1.1.0 h1:Xl7Ap9dKaEs5kLoOQeQmPWevfnk/DM5qcLcYlA8ys6Y=
modernc.org/token v1.1.0/go.mod h1:UGzOrNV1mAFSEB63lOFHIpNRUVMvYTc6yu1SMY/XTDM=

250
internal/collector/log.go Normal file
View file

@ -0,0 +1,250 @@
package collector
import (
"bufio"
"context"
"fmt"
"io"
"log"
"os"
"strings"
"sync"
"sync/atomic"
"time"
drain3go "codeberg.org/pata1704/drain3"
"codeberg.org/pata1704/guenther/internal/config"
idrain3 "codeberg.org/pata1704/guenther/internal/drain3"
"codeberg.org/pata1704/guenther/pkg/types"
"github.com/fsnotify/fsnotify"
)
// linePool recycles *strings.Builder instances used in the line-read hot path
// to reduce allocations when processing high-volume log files.
var linePool = sync.Pool{
New: func() any { return new(strings.Builder) },
}
// LogCollector tails a log file using inotify (fsnotify) and emits a
// types.LogEvent for every non-empty line.
//
// Processing pipeline per line:
//  1. ApplyMasking extracts named parameters and masks the line.
//  2. Drain3.Parse mines a template ID from the masked line.
//  3. Severity classified from the raw line.
//  4. Emit non-blocking channel send with drop counter.
//
// The collector uses a single goroutine per file and a WaitGroup for clean
// shutdown.
type LogCollector struct {
	// cfg supplies the log path (Ingestion.LogPath) and the Drain settings.
	cfg *config.Config
	// miner is the Drain3 template miner created in NewLogCollector.
	miner *drain3go.TemplateMiner
	// outputChan is the send-only channel on which log events are emitted.
	outputChan chan<- types.LogEvent
	// healthChan is the send-only channel for health reports.
	healthChan chan<- types.StageHealth
	// wg tracks the tailing goroutine launched by Start.
	wg sync.WaitGroup
	// processed and dropped are atomic counters; presumably lines handled and
	// events discarded by the non-blocking send — TODO confirm where updated.
	processed atomic.Uint64
	dropped   atomic.Uint64
}
// NewLogCollector builds a LogCollector that emits events on output and
// health reports on health.
//
// The Drain3 miner starts from drain3go.DefaultConfig and takes its similarity
// threshold, tree depth and max-children limit from cfg.Drain. It is backed by
// an in-memory persistence store, so the template tree starts empty on every
// restart (state persistence can be added via FilePersistence if needed).
//
// NOTE(review): configs/default.yaml also lists a max_clusters setting next to
// the other Drain values; it is not forwarded to the miner here — confirm
// whether that is intended.
func NewLogCollector(
	cfg *config.Config,
	output chan<- types.LogEvent,
	health chan<- types.StageHealth,
) *LogCollector {
	drainCfg := drain3go.DefaultConfig()
	drainCfg.Depth = cfg.Drain.Depth
	drainCfg.SimTh = cfg.Drain.SimThreshold
	drainCfg.MaxChildren = cfg.Drain.MaxChildren

	lc := new(LogCollector)
	lc.cfg = cfg
	lc.miner = drain3go.NewTemplateMiner(drainCfg, drain3go.NewMemoryPersistence())
	lc.outputChan = output
	lc.healthChan = health
	return lc
}
// Start begins tailing cfg.Ingestion.LogPath in a background goroutine.
//
// It returns an error if the file cannot be opened or positioned at its end,
// or if the fsnotify watcher cannot be created or attached. Errors that occur
// later, while tailing, are logged and not returned.
//
// The goroutine exits when ctx is cancelled, when the watcher channels are
// closed, or when the file cannot be reopened after a rotation. Use Wait to
// block until it has exited.
//
// Rotation handling (Remove/Rename events):
//   - lines still unread in the old file are drained before it is closed;
//   - reopening is retried for a short period, because the event can arrive
//     before the new file has been created;
//   - the new file is drained immediately, since lines written before the
//     watch was re-added produce no event of their own.
func (c *LogCollector) Start(ctx context.Context) error {
	const (
		readBufSize    = 64 * 1024
		reopenAttempts = 50
		reopenBackoff  = 100 * time.Millisecond
	)
	path := c.cfg.Ingestion.LogPath

	f, err := os.Open(path)
	if err != nil {
		return fmt.Errorf("log collector: open %q: %w", path, err)
	}
	// Seek to end: only tail new content, not existing content.
	if _, err := f.Seek(0, io.SeekEnd); err != nil {
		f.Close()
		return fmt.Errorf("log collector: seek %q: %w", path, err)
	}
	watcher, err := fsnotify.NewWatcher()
	if err != nil {
		f.Close()
		return fmt.Errorf("log collector: create fsnotify watcher: %w", err)
	}
	if err := watcher.Add(path); err != nil {
		f.Close()
		watcher.Close()
		return fmt.Errorf("log collector: watch %q: %w", path, err)
	}
	reader := bufio.NewReaderSize(f, readBufSize)
	reportTicker := time.NewTicker(5 * time.Second)

	// reopen retries reopenFile until it succeeds, the attempts are used up,
	// or ctx is cancelled.
	reopen := func() (*os.File, error) {
		var lastErr error
		for attempt := 0; attempt < reopenAttempts; attempt++ {
			nf, err := c.reopenFile()
			if err == nil {
				return nf, nil
			}
			lastErr = err
			select {
			case <-ctx.Done():
				return nil, ctx.Err()
			case <-time.After(reopenBackoff):
			}
		}
		return nil, lastErr
	}

	c.wg.Go(func() {
		// Close through a closure so the handle that is current at exit is
		// the one released; f is reassigned on rotation.
		defer func() { f.Close() }()
		defer watcher.Close()
		defer reportTicker.Stop()
		for {
			select {
			case event, ok := <-watcher.Events:
				if !ok {
					return
				}
				if event.Has(fsnotify.Write) {
					c.drainReader(reader)
				}
				if event.Has(fsnotify.Remove) || event.Has(fsnotify.Rename) {
					// Log rotation: reopen the file.
					log.Printf("log collector: file %q rotated reopening", path)
					// Pick up whatever was appended to the old file before
					// it is released.
					c.drainReader(reader)
					newF, err := reopen()
					if err != nil {
						log.Printf("log collector: reopen after rotation: %v", err)
						return
					}
					f.Close()
					f = newF
					reader = bufio.NewReaderSize(f, readBufSize)
					if err := watcher.Add(path); err != nil {
						log.Printf("log collector: re-watch after rotation: %v", err)
					}
					// Content written before the watch was re-added would
					// otherwise wait for the next write event.
					c.drainReader(reader)
				}
			case err, ok := <-watcher.Errors:
				if !ok {
					return
				}
				log.Printf("log collector: watcher error: %v", err)
			case <-reportTicker.C:
				c.emitHealth()
			case <-ctx.Done():
				return
			}
		}
	})
	return nil
}
// Wait blocks until the goroutine launched by Start has returned, which
// happens after the context passed to Start is cancelled (or tailing stops
// for another reason).
func (c *LogCollector) Wait() {
	c.wg.Wait()
}
// drainReader reads every complete line currently available from r and hands
// each one to processLine with its trailing "\r\n" / "\n" removed.
//
// A trailing partial line (no newline yet) is left untouched in the bufio
// buffer, so it is emitted as one event once the rest of the line arrives.
// The only exception is a line longer than the reader's buffer: it cannot be
// held back and is read with ReadString, which may emit it in pieces if the
// writer has not finished it yet.
//
// io.EOF ends the drain silently; any other read error is logged.
func (c *LogCollector) drainReader(r *bufio.Reader) {
	for {
		// Look at what is buffered without consuming it.
		pending, _ := r.Peek(r.Buffered())
		complete := false
		for _, b := range pending {
			if b == '\n' {
				complete = true
				break
			}
		}

		if !complete && len(pending) < r.Size() {
			// No full line buffered yet. Asking for one extra byte forces a
			// read from the file; on EOF the partial line stays buffered.
			if _, err := r.Peek(len(pending) + 1); err != nil {
				if err != io.EOF {
					log.Printf("log collector: read error: %v", err)
				}
				return
			}
			continue
		}

		// Either a newline is buffered or the line exceeds the buffer size.
		line, err := r.ReadString('\n')
		if len(line) > 0 {
			c.processLine(strings.TrimRight(line, "\r\n"))
		}
		if err != nil {
			// io.EOF means no more data right now; any other error is logged.
			if err != io.EOF {
				log.Printf("log collector: read error: %v", err)
			}
			return
		}
	}
}
// processLine turns one raw log line into a types.LogEvent.
//
// Empty lines are ignored. Otherwise the line is masked with the configured
// masking patterns, the masked text is fed to the Drain3 miner, and the
// resulting cluster ID is combined with the extracted parameters, the
// severity of the raw line and the current wall-clock time. If the miner
// returns nil the line is skipped without being counted.
//
// The send on outputChan never blocks: a successful send increments the
// processed counter, a full channel increments the dropped counter.
func (c *LogCollector) processLine(line string) {
	if len(line) == 0 {
		return
	}

	// Masking and parameter extraction, then template mining on the result.
	maskedLine, extracted := idrain3.ApplyMasking(line, c.cfg.Drain.MaskingPatterns)
	cluster := c.miner.AddLogMessage(maskedLine)
	if cluster == nil {
		return
	}

	select {
	case c.outputChan <- types.LogEvent{
		Timestamp:  time.Now(),
		TemplateID: cluster.ClusterID,
		Params:     extracted,
		Severity:   classifySeverity(line),
		RawLine:    line,
	}:
		c.processed.Add(1)
	default:
		c.dropped.Add(1)
	}
}
// reopenFile opens cfg.Ingestion.LogPath again after a log rotation. The
// returned handle is positioned at the start of the file, so the new file is
// read from its first byte. Failures are wrapped with an "open:" prefix.
func (c *LogCollector) reopenFile() (*os.File, error) {
	reopened, openErr := os.Open(c.cfg.Ingestion.LogPath)
	if openErr == nil {
		return reopened, nil
	}
	return nil, fmt.Errorf("open: %w", openErr)
}
// emitHealth publishes a StageHealth snapshot for the "log_collector" stage.
// The send is non-blocking: if healthChan is full the snapshot is discarded.
//
// NOTE(review): Throughput is the cumulative processed count divided by 5,
// not the number of events seen in the last 5-second report interval;
// confirm which one consumers expect.
func (c *LogCollector) emitHealth() {
	processedTotal := c.processed.Load()
	droppedTotal := c.dropped.Load()

	snapshot := types.StageHealth{
		StageName:       "log_collector",
		EventsProcessed: processedTotal,
		EventsDropped:   droppedTotal,
		Throughput:      float64(processedTotal) / 5.0,
		LastUpdate:      time.Now(),
	}

	select {
	case c.healthChan <- snapshot:
	default:
	}
}
// classifySeverity derives a severity level from a raw log line by scanning
// for well-known keywords, case-insensitively.
//
// Precedence, highest first:
//   - "ERROR": the line contains ERROR, FATAL or CRITICAL anywhere, or the
//     abbreviation ERR as a stand-alone word;
//   - "WARN":  the line contains WARN (which also covers WARNING);
//   - "DEBUG": the line contains DEBUG;
//   - "INFO":  everything else, including the empty string.
//
// ERR is matched only as a whole word (a run of ASCII letters) because as a
// plain substring it also occurs in ordinary words such as "transferred",
// "override" or "interrupt", which would turn routine lines into errors.
func classifySeverity(line string) string {
	upper := strings.ToUpper(line)

	// hasWord reports whether word appears in upper delimited by non-letters.
	hasWord := func(word string) bool {
		tokens := strings.FieldsFunc(upper, func(r rune) bool {
			return r < 'A' || r > 'Z'
		})
		for _, tok := range tokens {
			if tok == word {
				return true
			}
		}
		return false
	}

	switch {
	case strings.Contains(upper, "ERROR"),
		strings.Contains(upper, "FATAL"),
		strings.Contains(upper, "CRITICAL"),
		hasWord("ERR"):
		return "ERROR"
	case strings.Contains(upper, "WARN"):
		return "WARN"
	case strings.Contains(upper, "DEBUG"):
		return "DEBUG"
	default:
		return "INFO"
	}
}

View file

@ -0,0 +1,45 @@
package collector
import (
"os"
"testing"
"time"
"codeberg.org/pata1704/guenther/internal/config"
"codeberg.org/pata1704/guenther/pkg/types"
"github.com/stretchr/testify/assert"
)
func TestLogCollector_ProcessLine(t *testing.T) {
// 1. Create temporary log file
tmpFile, err := os.CreateTemp("", "test_log_*.log")
assert.NoError(t, err)
defer os.Remove(tmpFile.Name())
outputChan := make(chan types.LogEvent, 10)
healthChan := make(chan types.StageHealth, 10)
cfg := &config.Config{}
cfg.Ingestion.LogPath = tmpFile.Name()
cfg.Drain.Depth = 4
cfg.Drain.SimThreshold = 0.5
cfg.Drain.MaxChildren = 100
collector := NewLogCollector(cfg, outputChan, healthChan)
// 2. Test line processing with specific regex patterns
testLine := "2026-02-26 13:00:00.123456 INFO Transfer from 192.168.1.1:8080 completed (duration=1.23)"
collector.processLine(testLine)
select {
case ev := <-outputChan:
assert.Equal(t, "INFO", ev.Severity)
assert.Greater(t, ev.TemplateID, 0)
t.Logf("Extracted parameters: %v", ev.Params)
// Unconfigured Drain3 template yields empty map
assert.GreaterOrEqual(t, len(ev.Params), 0)
case <-time.After(1 * time.Second):
t.Fatal("Timeout waiting for LogEvent")
}
}

View file

@ -0,0 +1,542 @@
package collector
import (
"bufio"
"context"
"log"
"os"
"strconv"
"strings"
"sync"
"sync/atomic"
"time"
"codeberg.org/pata1704/guenther/pkg/types"
)
// MetricCollector samples Linux system metrics from /proc at a fixed interval
// and emits a types.MetricSnapshot for each sample.
//
// All /proc reads happen in the single collector goroutine started by Start,
// so no locking is used for the delta-state fields. The output channel uses a
// non-blocking send; snapshots that do not fit are counted in dropped.
type MetricCollector struct {
// outputChan receives one snapshot per sample (send-only).
outputChan chan<- types.MetricSnapshot
// healthChan receives periodic StageHealth snapshots (send-only).
healthChan chan<- types.StageHealth
// interval is the sampling period used for the ticker in Start.
interval time.Duration
// netInterface is the interface name looked up in /proc/net/dev.
netInterface string
// diskDevice is the device name looked up in /proc/diskstats.
diskDevice string
// wg tracks the collector goroutine.
wg sync.WaitGroup
// Delta state: the cumulative counter values of the previous sample,
// written by storePrev and read by collect to compute per-interval rates.
// Only accessed from the single collector goroutine.
prevSoftnetDropped uint64
prevSoftnetSqueeze uint64
prevNetPacketsIn uint64
prevNetPacketsOut uint64
prevDiskReadsComp uint64
prevDiskWritesComp uint64
prevDiskRead uint64
prevDiskWrite uint64
prevDiskReadTimeMs uint64
prevDiskWriteTimeMs uint64
prevDiskIOTicks uint64
prevCPUTotal uint64
prevCPUIdle uint64
prevCPUIoWait uint64
prevCPUSoftIrq uint64
prevCtxt uint64
prevIntr uint64
prevNetIn uint64
prevNetOut uint64
prevNetErrs uint64
prevNetDrops uint64
prevTCPRetrans uint64
prevTCPTimeouts uint64
prevTCPLostRetrans uint64
prevTCPFastRetrans uint64
// prevTime is the timestamp of the previous sample.
prevTime time.Time
// firstSample is true until the first sample has been stored as a baseline;
// that first sample produces no snapshot.
firstSample bool
// processed counts snapshots sent on outputChan.
processed atomic.Uint64
// dropped counts snapshots discarded because outputChan was full.
dropped atomic.Uint64
}
// NewMetricCollector returns a MetricCollector that samples every interval,
// reads network counters for netIntf and disk counters for diskDev, and
// publishes snapshots on output and health reports on health.
//
// The collector starts in "first sample" state: the first sample taken after
// Start only records a baseline and emits nothing.
func NewMetricCollector(
	output chan<- types.MetricSnapshot,
	health chan<- types.StageHealth,
	interval time.Duration,
	netIntf, diskDev string,
) *MetricCollector {
	mc := new(MetricCollector)
	mc.outputChan = output
	mc.healthChan = health
	mc.interval = interval
	mc.netInterface = netIntf
	mc.diskDevice = diskDev
	mc.firstSample = true
	return mc
}
// Start launches the sampling goroutine. On every tick of the sampling
// interval it collects a snapshot and offers it to outputChan without
// blocking (counting it as processed or dropped); every 5 seconds it emits a
// health report. The goroutine returns when ctx is cancelled.
//
// NOTE(review): time.NewTicker panics for a non-positive duration, so
// c.interval must be greater than zero.
func (c *MetricCollector) Start(ctx context.Context) {
	sampleTicker := time.NewTicker(c.interval)
	healthTicker := time.NewTicker(5 * time.Second)
	c.prevTime = time.Now()

	c.wg.Go(func() {
		defer healthTicker.Stop()
		defer sampleTicker.Stop()

		for {
			select {
			case <-ctx.Done():
				return
			case <-healthTicker.C:
				c.emitHealth()
			case <-sampleTicker.C:
				// collect returns nil for the baseline sample.
				if snap := c.collect(); snap != nil {
					select {
					case c.outputChan <- *snap:
						c.processed.Add(1)
					default:
						c.dropped.Add(1)
					}
				}
			}
		}
	})
}
// Wait blocks until the sampling goroutine launched by Start has returned,
// which happens after the context passed to Start is cancelled.
func (c *MetricCollector) Wait() {
	c.wg.Wait()
}
// ── collection ────────────────────────────────────────────────────────────────
// collect takes one sample of all /proc sources and converts the cumulative
// counters into rates relative to the previous sample.
//
// The very first call only stores the counters as a baseline and returns
// nil. Later calls return a snapshot whose rate fields are counter deltas
// divided by the elapsed seconds (clamped to at least 1e-6 to avoid division
// by zero); byte counters are additionally divided by 1 048 576. Memory
// values are passed through as read. Counter decreases are clamped to zero
// by saturatingSub.
func (c *MetricCollector) collect() *types.MetricSnapshot {
	now := time.Now()
	elapsed := now.Sub(c.prevTime).Seconds()

	cpuTotal, cpuIdle, cpuIowait, cpuSoftirq, ctxt, intr := c.readSystemStats()
	memUsed, memCached, memDirty := c.readMemInfo()
	netIn, netOut, netErrs, netDrops, rxPackets, txPackets := c.readNetDev()
	retrans := c.readSNMPStats()
	timeouts, lostRetrans, fastRetrans := c.readNetstat()
	softDropped, softSqueeze := c.readSoftnetStat()
	diskRead, diskWrite, diskReadTime, diskWriteTime, diskIOTicks, readsComp, writesComp := c.readDiskStats()

	// remember records the counters just read as the new baseline.
	remember := func() {
		c.storePrev(now,
			cpuTotal, cpuIdle, cpuIowait, cpuSoftirq, ctxt, intr,
			netIn, netOut, netErrs, netDrops, rxPackets, txPackets,
			retrans, timeouts, lostRetrans, fastRetrans,
			softDropped, softSqueeze,
			diskRead, diskWrite, diskReadTime, diskWriteTime, diskIOTicks, readsComp, writesComp)
	}

	if c.firstSample {
		remember()
		c.firstSample = false
		return nil
	}

	if elapsed < 1e-6 {
		elapsed = 1e-6
	}

	// perSec converts a cumulative counter pair into a per-second rate.
	perSec := func(cur, prev uint64) float64 {
		return float64(saturatingSub(cur, prev)) / elapsed
	}
	// mibPerSec is perSec for byte counters, scaled by 2^20.
	mibPerSec := func(cur, prev uint64) float64 {
		return perSec(cur, prev) / 1_048_576
	}

	var cpuBusyPct, cpuIowaitPct, cpuSoftirqPct float64
	if totalDelta := saturatingSub(cpuTotal, c.prevCPUTotal); totalDelta > 0 {
		idleDelta := saturatingSub(cpuIdle, c.prevCPUIdle)
		cpuBusyPct = float64(totalDelta-idleDelta) / float64(totalDelta) * 100.0
		cpuIowaitPct = float64(saturatingSub(cpuIowait, c.prevCPUIoWait)) / float64(totalDelta) * 100.0
		cpuSoftirqPct = float64(saturatingSub(cpuSoftirq, c.prevCPUSoftIrq)) / float64(totalDelta) * 100.0
	}

	// The snapshot must be built before remember() overwrites the baseline.
	snap := &types.MetricSnapshot{
		Timestamp: now,

		CPUPercent:          cpuBusyPct,
		CPUIoWaitPercent:    cpuIowaitPct,
		CPUSoftIrqPercent:   cpuSoftirqPct,
		ContextSwitchesPerS: perSec(ctxt, c.prevCtxt),
		InterruptsPerS:      perSec(intr, c.prevIntr),

		MemoryUsedMB:   float64(memUsed),
		MemoryCachedMB: float64(memCached),
		MemoryDirtyMB:  float64(memDirty),

		NetworkInMBps:     mibPerSec(netIn, c.prevNetIn),
		NetworkOutMBps:    mibPerSec(netOut, c.prevNetOut),
		NetErrorsPerS:     perSec(netErrs, c.prevNetErrs),
		NetDropsPerS:      perSec(netDrops, c.prevNetDrops),
		NetPacketsInPerS:  perSec(rxPackets, c.prevNetPacketsIn),
		NetPacketsOutPerS: perSec(txPackets, c.prevNetPacketsOut),

		TCPRetransPerS:         perSec(retrans, c.prevTCPRetrans),
		TCPTimeoutsPerS:        perSec(timeouts, c.prevTCPTimeouts),
		TCPLostRetransmitPerS:  perSec(lostRetrans, c.prevTCPLostRetrans),
		TCPFastRetransPerS:     perSec(fastRetrans, c.prevTCPFastRetrans),
		SoftnetDroppedPerS:     perSec(softDropped, c.prevSoftnetDropped),
		SoftnetTimeSqueezePerS: perSec(softSqueeze, c.prevSoftnetSqueeze),

		DiskReadMBps:            mibPerSec(diskRead, c.prevDiskRead),
		DiskWriteMBps:           mibPerSec(diskWrite, c.prevDiskWrite),
		DiskReadTimeMsPerS:      perSec(diskReadTime, c.prevDiskReadTimeMs),
		DiskWriteTimeMsPerS:     perSec(diskWriteTime, c.prevDiskWriteTimeMs),
		DiskIOTicksPerS:         perSec(diskIOTicks, c.prevDiskIOTicks),
		DiskReadsCompletedPerS:  perSec(readsComp, c.prevDiskReadsComp),
		DiskWritesCompletedPerS: perSec(writesComp, c.prevDiskWritesComp),
	}

	remember()
	return snap
}
// storePrev records the sample time and every cumulative counter of the
// current sample so the next collect call can compute deltas against them.
// Arguments are positional and all uint64; their order must match the call
// sites in collect.
func (c *MetricCollector) storePrev(
	now time.Time,
	cpuTotal, cpuIdle, cpuIowait, cpuSoftirq, ctxt, intr,
	netIn, netOut, netErrs, netDrops, rxPackets, txPackets,
	retrans, timeouts, lostRetrans, fastRetrans,
	softDropped, softSqueeze,
	diskRead, diskWrite, diskReadTime, diskWriteTime, diskIOTicks, readsComp, writesComp uint64,
) {
	c.prevTime = now

	// CPU and scheduler counters.
	c.prevCPUTotal, c.prevCPUIdle = cpuTotal, cpuIdle
	c.prevCPUIoWait, c.prevCPUSoftIrq = cpuIowait, cpuSoftirq
	c.prevCtxt, c.prevIntr = ctxt, intr

	// Network interface counters.
	c.prevNetIn, c.prevNetOut = netIn, netOut
	c.prevNetErrs, c.prevNetDrops = netErrs, netDrops
	c.prevNetPacketsIn, c.prevNetPacketsOut = rxPackets, txPackets

	// TCP and softnet counters.
	c.prevTCPRetrans, c.prevTCPTimeouts = retrans, timeouts
	c.prevTCPLostRetrans, c.prevTCPFastRetrans = lostRetrans, fastRetrans
	c.prevSoftnetDropped, c.prevSoftnetSqueeze = softDropped, softSqueeze

	// Disk counters.
	c.prevDiskRead, c.prevDiskWrite = diskRead, diskWrite
	c.prevDiskReadTimeMs, c.prevDiskWriteTimeMs = diskReadTime, diskWriteTime
	c.prevDiskIOTicks = diskIOTicks
	c.prevDiskReadsComp, c.prevDiskWritesComp = readsComp, writesComp
}
// ── /proc readers ─────────────────────────────────────────────────────────────
// readSystemStats reads /proc/stat and returns cumulative CPU jiffies
// (total, idle, iowait, softirq) plus the cumulative context-switch and
// interrupt counts. On an open failure it logs and returns all zeros.
//
// /proc/stat aggregate "cpu" line, column layout:
//
//	1=user 2=nice 3=system 4=idle 5=iowait 6=irq 7=softirq 8=steal
//	9=guest 10=guest_nice
//
// total is the sum of columns 1..8 only. The kernel already accounts guest
// and guest_nice time inside user and nice, so adding columns 9 and 10 would
// count that time twice and depress every percentage derived from total.
// Unparsable fields count as zero.
func (c *MetricCollector) readSystemStats() (total, idle, iowait, softirq, ctxt, intr uint64) {
	// Last column that contributes to the total (steal).
	const lastAdditiveCol = 8

	f, err := os.Open("/proc/stat")
	if err != nil {
		log.Printf("metric: open /proc/stat: %v", err)
		return
	}
	defer f.Close()

	scanner := bufio.NewScanner(f)
	for scanner.Scan() {
		fields := strings.Fields(scanner.Text())
		if len(fields) == 0 {
			continue
		}
		switch fields[0] {
		case "cpu":
			for i := 1; i < len(fields) && i <= lastAdditiveCol; i++ {
				v, _ := strconv.ParseUint(fields[i], 10, 64)
				total += v
				switch i {
				case 4:
					idle = v
				case 5:
					iowait = v
				case 7:
					softirq = v
				}
			}
		case "ctxt":
			if len(fields) > 1 {
				ctxt, _ = strconv.ParseUint(fields[1], 10, 64)
			}
		case "intr":
			// The first value on the intr line is the total across all IRQs.
			if len(fields) > 1 {
				intr, _ = strconv.ParseUint(fields[1], 10, 64)
			}
		}
	}
	if err := scanner.Err(); err != nil {
		log.Printf("metric: scan /proc/stat: %v", err)
	}
	return
}
// readMemInfo reads /proc/meminfo and returns used, cached and dirty memory.
// Each value is the kB figure from the file integer-divided by 1024.
//
// used is MemTotal minus MemAvailable; it stays zero if MemAvailable exceeds
// MemTotal. On an open failure it logs and returns all zeros. Unparsable
// values count as zero.
func (c *MetricCollector) readMemInfo() (used, cached, dirty uint64) {
	file, err := os.Open("/proc/meminfo")
	if err != nil {
		log.Printf("metric: open /proc/meminfo: %v", err)
		return
	}
	defer file.Close()

	var memTotalKB, memAvailKB uint64
	sc := bufio.NewScanner(file)
	for sc.Scan() {
		cols := strings.Fields(sc.Text())
		if len(cols) >= 2 {
			kb, _ := strconv.ParseUint(cols[1], 10, 64)
			switch cols[0] {
			case "MemTotal:":
				memTotalKB = kb
			case "MemAvailable:":
				memAvailKB = kb
			case "Cached:":
				cached = kb / 1024
			case "Dirty:":
				dirty = kb / 1024
			}
		}
	}
	if scanErr := sc.Err(); scanErr != nil {
		log.Printf("metric: scan /proc/meminfo: %v", scanErr)
	}

	if memAvailKB > memTotalKB {
		return
	}
	used = (memTotalKB - memAvailKB) / 1024
	return
}
// readNetDev reads the cumulative counters of the configured interface from
// /proc/net/dev. errs and drops are the sums of the receive and transmit
// columns.
//
// /proc/net/dev column layout (after stripping "iface:"):
//
//	0=rx_bytes 1=rx_packets 2=rx_errs 3=rx_drop
//	4=rx_fifo 5=rx_frame 6=rx_compressed 7=rx_multicast
//	8=tx_bytes 9=tx_packets 10=tx_errs 11=tx_drop ...
//
// All zeros are returned when the file cannot be opened (not logged), when
// the interface is not listed, or when its line has fewer than 12 columns
// (logged). Unparsable fields count as zero.
func (c *MetricCollector) readNetDev() (rxBytes, txBytes, errs, drops, rxPackets, txPackets uint64) {
	file, err := os.Open("/proc/net/dev")
	if err != nil {
		return 0, 0, 0, 0, 0, 0
	}
	defer file.Close()

	wanted := c.netInterface + ":"
	sc := bufio.NewScanner(file)
	for sc.Scan() {
		row := strings.TrimSpace(sc.Text())
		if !strings.HasPrefix(row, wanted) {
			continue
		}
		cols := strings.Fields(strings.TrimPrefix(row, wanted))
		if len(cols) < 12 {
			log.Printf("metric: unexpected /proc/net/dev format for %q", c.netInterface)
			return 0, 0, 0, 0, 0, 0
		}
		// col parses one decimal column, yielding zero on malformed input.
		col := func(i int) uint64 {
			v, _ := strconv.ParseUint(cols[i], 10, 64)
			return v
		}
		return col(0), col(8), col(2) + col(10), col(3) + col(11), col(1), col(9)
	}
	if scanErr := sc.Err(); scanErr != nil {
		log.Printf("metric: scan /proc/net/dev: %v", scanErr)
	}
	return 0, 0, 0, 0, 0, 0
}
// readSNMPStats returns the cumulative RetransSegs counter from the Tcp
// section of /proc/net/snmp.
//
// The section consists of a header row and a value row, both starting with
// "Tcp:". The column is located by its header name rather than by a fixed
// position. Zero is returned when the file cannot be opened (not logged),
// when the column is missing, or when the value cannot be parsed.
func (c *MetricCollector) readSNMPStats() uint64 {
	file, err := os.Open("/proc/net/snmp")
	if err != nil {
		return 0
	}
	defer file.Close()

	var (
		sawHeader  bool
		retransCol = -1 // index of "RetransSegs" in the header row
	)
	sc := bufio.NewScanner(file)
	for sc.Scan() {
		row := sc.Text()
		if !strings.HasPrefix(row, "Tcp:") {
			continue
		}
		cols := strings.Fields(row)

		if !sawHeader {
			// First Tcp: row carries the column names.
			sawHeader = true
			for idx, name := range cols {
				if name == "RetransSegs" {
					retransCol = idx
					break
				}
			}
			continue
		}

		// Subsequent Tcp: rows carry the values.
		if retransCol >= 0 && retransCol < len(cols) {
			value, _ := strconv.ParseUint(cols[retransCol], 10, 64)
			return value
		}
	}
	if scanErr := sc.Err(); scanErr != nil {
		log.Printf("metric: scan /proc/net/snmp: %v", scanErr)
	}
	return 0
}
// readNetstat returns the cumulative TCPTimeouts, TCPLostRetransmit and
// TCPFastRetrans counters from the TcpExt section of /proc/net/netstat.
//
// The first "TcpExt:" row is treated as the header and later "TcpExt:" rows
// as values; columns are matched by header name. Zeros are returned when the
// file cannot be opened (not logged); counters that are absent or unparsable
// stay zero.
func (c *MetricCollector) readNetstat() (timeouts, lostRetrans, fastRetrans uint64) {
	file, err := os.Open("/proc/net/netstat")
	if err != nil {
		return 0, 0, 0
	}
	defer file.Close()

	var names []string
	sc := bufio.NewScanner(file)
	for sc.Scan() {
		row := sc.Text()
		if !strings.HasPrefix(row, "TcpExt:") {
			continue
		}
		cols := strings.Fields(row)
		if names == nil {
			names = cols
			continue
		}

		// Value row: only positions present in both rows are considered.
		limit := min(len(names), len(cols))
		for idx := 0; idx < limit; idx++ {
			switch names[idx] {
			case "TCPTimeouts":
				timeouts, _ = strconv.ParseUint(cols[idx], 10, 64)
			case "TCPLostRetransmit":
				lostRetrans, _ = strconv.ParseUint(cols[idx], 10, 64)
			case "TCPFastRetrans":
				fastRetrans, _ = strconv.ParseUint(cols[idx], 10, 64)
			}
		}
	}
	if scanErr := sc.Err(); scanErr != nil {
		log.Printf("metric: scan /proc/net/netstat: %v", scanErr)
	}
	return
}
// readSoftnetStat reads /proc/net/softnet_stat and returns the dropped and
// time_squeeze counters summed over all rows of the file. Values in this
// file are hexadecimal.
//
// Rows with fewer than three columns are ignored and unparsable values count
// as zero. Zeros are returned when the file cannot be opened (not logged).
func (c *MetricCollector) readSoftnetStat() (dropped, timeSqueeze uint64) {
	file, err := os.Open("/proc/net/softnet_stat")
	if err != nil {
		return 0, 0
	}
	defer file.Close()

	sc := bufio.NewScanner(file)
	for sc.Scan() {
		// col 0 = total, col 1 = dropped, col 2 = time_squeeze
		cols := strings.Fields(sc.Text())
		if len(cols) < 3 {
			continue
		}
		rowDropped, _ := strconv.ParseUint(cols[1], 16, 64)
		rowSqueeze, _ := strconv.ParseUint(cols[2], 16, 64)
		dropped += rowDropped
		timeSqueeze += rowSqueeze
	}
	if scanErr := sc.Err(); scanErr != nil {
		log.Printf("metric: scan /proc/net/softnet_stat: %v", scanErr)
	}
	return
}
// readDiskStats reads the cumulative counters of the configured device from
// /proc/diskstats. Sector counts are converted to bytes by multiplying by
// 512.
//
// /proc/diskstats column layout:
//
//	0=major 1=minor 2=name
//	3=reads_completed 4=reads_merged 5=sectors_read 6=read_time_ms
//	7=writes_completed 8=writes_merged 9=sectors_written 10=write_time_ms
//	11=io_in_progress 12=io_ticks_ms 13=weighted_io_ticks
//
// Rows with fewer than 14 columns are skipped. All zeros are returned when
// the file cannot be opened (logged) or the device is not listed. Unparsable
// fields count as zero.
func (c *MetricCollector) readDiskStats() (readBytes, writeBytes, readTimeMs, writeTimeMs, ioTicks, readsComp, writesComp uint64) {
	file, err := os.Open("/proc/diskstats")
	if err != nil {
		log.Printf("metric: open /proc/diskstats: %v", err)
		return
	}
	defer file.Close()

	sc := bufio.NewScanner(file)
	for sc.Scan() {
		cols := strings.Fields(sc.Text())
		if len(cols) < 14 {
			continue
		}
		if cols[2] != c.diskDevice {
			continue
		}
		// col parses one decimal column, yielding zero on malformed input.
		col := func(i int) uint64 {
			v, _ := strconv.ParseUint(cols[i], 10, 64)
			return v
		}
		return col(5) * 512, col(9) * 512, col(6), col(10), col(12), col(3), col(7)
	}
	if scanErr := sc.Err(); scanErr != nil {
		log.Printf("metric: scan /proc/diskstats: %v", scanErr)
	}
	return
}
// ── health ────────────────────────────────────────────────────────────────────
// emitHealth publishes a StageHealth snapshot for the "metric_collector"
// stage. The send is non-blocking: if healthChan is full the snapshot is
// discarded.
//
// NOTE(review): Throughput is the cumulative processed count divided by 5,
// not the number of snapshots produced in the last 5-second report interval;
// confirm which one consumers expect.
func (c *MetricCollector) emitHealth() {
	processedTotal := c.processed.Load()
	droppedTotal := c.dropped.Load()

	report := types.StageHealth{
		StageName:       "metric_collector",
		EventsProcessed: processedTotal,
		EventsDropped:   droppedTotal,
		Throughput:      float64(processedTotal) / 5.0,
		LastUpdate:      time.Now(),
	}

	select {
	case c.healthChan <- report:
	default:
	}
}
// ── helpers ───────────────────────────────────────────────────────────────────
// saturatingSub returns a - b, or 0 when b is larger than a.
// Clamping keeps a counter that went backwards (reset or wrap) from turning
// into a huge unsigned difference and therefore a bogus rate.
func saturatingSub(a, b uint64) uint64 {
	if b > a {
		return 0
	}
	return a - b
}

View file

@ -0,0 +1,140 @@
package collector
import (
"bytes"
"context"
"log"
"os/exec"
"strings"
"sync"
"time"
"codeberg.org/pata1704/guenther/pkg/types"
)
// SystemctlCollector periodically checks the status of systemd services by
// running "systemctl show" for each configured unit and emitting one
// types.ServiceStatus per unit.
type SystemctlCollector struct {
// services lists the unit names to query; Start does nothing if it is empty.
services []string
// interval is the polling period.
interval time.Duration
// outputChan receives the collected statuses (send-only, non-blocking send).
outputChan chan<- types.ServiceStatus
// healthChan receives periodic StageHealth snapshots (send-only).
healthChan chan<- types.StageHealth
// wg tracks the collection goroutine.
wg sync.WaitGroup
// mu guards processed.
mu sync.Mutex
// processed counts statuses successfully sent on outputChan.
processed uint64
}
// NewSystemctlCollector returns a collector that polls the given services
// every interval, publishing statuses on output and health reports on
// health. Nothing runs until Start is called.
func NewSystemctlCollector(
	services []string,
	interval time.Duration,
	output chan<- types.ServiceStatus,
	health chan<- types.StageHealth,
) *SystemctlCollector {
	sc := new(SystemctlCollector)
	sc.services = services
	sc.interval = interval
	sc.outputChan = output
	sc.healthChan = health
	return sc
}
// Start launches the collection goroutine. It performs one collection
// immediately, then one per interval, and emits a health report every
// 5 seconds until ctx is cancelled.
//
// If no services are configured it logs that fact and returns without
// starting a goroutine.
//
// NOTE(review): time.NewTicker panics for a non-positive duration, so
// c.interval must be greater than zero.
func (c *SystemctlCollector) Start(ctx context.Context) {
	if len(c.services) == 0 {
		log.Println("systemctl: no services configured for monitoring")
		return
	}

	loop := func() {
		pollTicker := time.NewTicker(c.interval)
		defer pollTicker.Stop()
		healthTicker := time.NewTicker(5 * time.Second)
		defer healthTicker.Stop()

		// Immediate first collection.
		c.collect()

		for {
			select {
			case <-pollTicker.C:
				c.collect()
			case <-healthTicker.C:
				c.emitHealth()
			case <-ctx.Done():
				return
			}
		}
	}
	c.wg.Go(loop)
}
// Wait blocks until the goroutine launched by Start has returned. It returns
// immediately if Start never launched one.
func (c *SystemctlCollector) Wait() {
	c.wg.Wait()
}
// collect queries every configured service once, in order. A service whose
// status cannot be obtained is logged and skipped. Each status is offered to
// outputChan without blocking: a successful send increments the processed
// counter, a full channel is logged and the status is discarded.
func (c *SystemctlCollector) collect() {
	for _, unit := range c.services {
		st, err := c.getServiceStatus(unit)
		if err != nil {
			log.Printf("systemctl: error getting status for %s: %v", unit, err)
			continue
		}

		sent := false
		select {
		case c.outputChan <- st:
			sent = true
		default:
		}

		if !sent {
			log.Printf("systemctl: output channel full dropping status for %s", unit)
			continue
		}
		c.mu.Lock()
		c.processed++
		c.mu.Unlock()
	}
}
// getServiceStatus runs "systemctl show -p ActiveState,SubState -- <service>"
// and parses its KEY=VALUE output into a types.ServiceStatus stamped with the
// current time. Properties missing from the output are left empty.
//
// The command is bounded by a timeout so that a stalled systemctl cannot
// block the collection goroutine (and with it shutdown) indefinitely. The
// "--" separator keeps unit names that begin with "-" (for example
// "-.mount") from being parsed as options.
//
// It returns the error from running the command, including the case where
// the process was killed because the timeout expired.
func (c *SystemctlCollector) getServiceStatus(service string) (types.ServiceStatus, error) {
	const queryTimeout = 5 * time.Second

	ctx, cancel := context.WithTimeout(context.Background(), queryTimeout)
	defer cancel()

	// Use systemctl show to get machine-readable properties.
	cmd := exec.CommandContext(ctx, "systemctl", "show", "-p", "ActiveState,SubState", "--", service)
	var out bytes.Buffer
	cmd.Stdout = &out
	if err := cmd.Run(); err != nil {
		return types.ServiceStatus{}, err
	}

	status := types.ServiceStatus{
		Timestamp:   time.Now(),
		ServiceName: service,
	}
	for _, line := range strings.Split(strings.TrimSpace(out.String()), "\n") {
		key, value, found := strings.Cut(line, "=")
		if !found {
			continue
		}
		switch key {
		case "ActiveState":
			status.ActiveState = value
		case "SubState":
			status.SubState = value
		}
	}
	return status, nil
}
// emitHealth publishes a StageHealth snapshot for the "systemctl_collector"
// stage containing the processed count. The send is non-blocking: if
// healthChan is full the snapshot is discarded.
func (c *SystemctlCollector) emitHealth() {
	c.mu.Lock()
	sentSoFar := c.processed
	c.mu.Unlock()

	report := types.StageHealth{
		StageName:       "systemctl_collector",
		EventsProcessed: sentSoFar,
		LastUpdate:      time.Now(),
	}

	select {
	case c.healthChan <- report:
	default:
	}
}

203
internal/config/config.go Normal file
View file

@ -0,0 +1,203 @@
// Package config provides the pipeline configuration loaded from YAML.
package config
import (
"fmt"
"os"
"regexp"
"time"
"gopkg.in/yaml.v3"
)
// MaskingPattern is a single entry in drain.masking_patterns.
type MaskingPattern struct {
// Name identifies the pattern; it appears in Compile error messages and is
// what NumericPatternNames returns.
Name string `yaml:"name"`
// Pattern is the regular-expression source compiled by Config.Compile.
Pattern string `yaml:"pattern"`
// Replace is the replacement text (presumably substituted for matches when
// masking; the masking code is outside this file).
Replace string `yaml:"replace"`
// Type classifies the captured value; "float" and "int" are treated as
// numeric by NumericPatternNames.
Type string `yaml:"type"`
// Re is the compiled form of Pattern, set by Config.Compile and nil before.
// NOTE(review): the field has no yaml tag, so the YAML decoder does not
// exclude it explicitly; consider `yaml:"-"`.
Re *regexp.Regexp
}
// MADConfig defines parameters for the MAD detector.
type MADConfig struct {
// Threshold is the modified Z-score cutoff for IsAnomaly.
// Recommended: 3.0 to 4.0. Default: 3.5.
Threshold float64 `yaml:"threshold"`
// CalibrationSize is the number of NormalizedVectors to buffer before
// automatic per-feature median/MAD calibration runs.
// Default (if 0): 100.
CalibrationSize int `yaml:"calibration_size"`
}
// COPODConfig defines the parameters for the Copula-Based Outlier detector.
type COPODConfig struct {
// Threshold is the score cutoff (presumably passed to NewCOPODDetector as
// threshold; verify at the call site).
Threshold float64 `yaml:"threshold"`
// BufferSize is the sliding-window capacity (presumably passed to
// NewCOPODDetector as bufferSize; verify at the call site).
BufferSize int `yaml:"buffer_size"`
}
// RRCFConfig defines the parameters for the Robust Random Cut Forest detector.
// Used for the standalone RRCF detector and the classic AVG/MAX/MEDIAN ensemble paths.
type RRCFConfig struct {
// NumTrees is the number of trees in the forest.
NumTrees int `yaml:"num_trees"`
// TreeSize is the per-tree capacity.
TreeSize int `yaml:"tree_size"`
// ThresholdPercentile is the decision threshold, expressed as a percentile.
ThresholdPercentile float64 `yaml:"threshold_percentile"`
}
// RRCFVariantConfig holds parameters for a single named RRCF variant inside
// the SEAD multi-horizon ensemble.
type RRCFVariantConfig struct {
// NumTrees controls score stability: more trees give smoother, more
// conservative scores.
NumTrees int `yaml:"num_trees"`
// TreeSize sets the sliding-window capacity per tree.
TreeSize int `yaml:"tree_size"`
// ThresholdPercentile is the per-model decision threshold (standalone use).
ThresholdPercentile float64 `yaml:"threshold_percentile"`
}
// RRCFVariantsConfig groups the three RRCF variants used by the SEAD ensemble.
// Each variant captures anomalies at a different time-horizon:
//   - Fast: short memory, reactive to transient spikes
//   - Mid: medium memory, balanced sensitivity
//   - Slow: long memory, detects sustained / slow-drift events
type RRCFVariantsConfig struct {
// Fast is the short-horizon variant.
Fast RRCFVariantConfig `yaml:"fast"`
// Mid is the medium-horizon variant.
Mid RRCFVariantConfig `yaml:"mid"`
// Slow is the long-horizon variant.
Slow RRCFVariantConfig `yaml:"slow"`
}
// SEADConfig holds tunable parameters for the SEAD ensemble.
// Only used when EnsembleConfig.Method == "sead".
type SEADConfig struct {
// Eta is the MWU learning rate η ∈ (0, 1].
// Higher values react faster to distribution shifts but are noisier.
// Recommended: 0.05 to 0.20. Default (if 0): 0.10.
Eta float64 `yaml:"eta"`
// Lambda is the KL-divergence regularisation strength.
// 0 = pure MWU (uniform prior). Recommended: 0.0 to 0.05. Default: 0.01.
Lambda float64 `yaml:"lambda"`
// QuantileWindow is the number of past scores retained per detector for
// streaming quantile normalisation. Default (if 0): 300.
QuantileWindow int `yaml:"quantile_window"`
// MinDataPoints is the cold-start guard: no anomaly is flagged until at
// least this many windows have been scored. Default (if 0): 20.
MinDataPoints int `yaml:"min_data_points"`
}
// EnsembleConfig manages the routing for the multi-model detector.
type EnsembleConfig struct {
// Enabled switches the ensemble on or off.
Enabled bool `yaml:"enabled"`
// Method selects the score-aggregation strategy.
// Allowed values: "avg" (default), "max", "median", "sead".
// "sead": adaptive Multiplicative Weights Update ensemble (Shah et al., ICML 2025).
Method string `yaml:"method"`
// Contamination is the expected fraction of anomalous windows ∈ [0, 0.5).
// Determines the decision threshold as quantile(1-contamination) of
// the rolling combined score history.
Contamination float64 `yaml:"contamination"`
// SEAD tuning parameters (only applied when Method == "sead").
SEAD SEADConfig `yaml:"sead"`
}
// AutoScalingConfig holds thresholds and durations for dynamic detector switching.
// The inline examples describe three levels (Normal, High, Critical); each
// threshold is paired with the duration, in seconds, that applies to it.
type AutoScalingConfig struct {
// Enabled switches auto-scaling on or off.
Enabled bool `yaml:"enabled"`
HighThreshold float64 `yaml:"high_threshold"` // e.g. 0.75 (Normal -> High)
CritThreshold float64 `yaml:"critical_threshold"` // e.g. 0.90 (High -> Critical)
HighDuration float64 `yaml:"high_duration"` // e.g. 30.0 (seconds)
CritDuration float64 `yaml:"critical_duration"` // e.g. 15.0 (seconds)
DownThreshold float64 `yaml:"down_threshold"` // e.g. 0.50 (back to Normal)
DownDuration float64 `yaml:"down_duration"` // e.g. 60.0 (seconds)
}
// DetectorConfig groups all anomaly detection configurations.
type DetectorConfig struct {
// Method names the detector to run (allowed values are defined by the
// detection code, outside this file).
Method string `yaml:"method"`
// Ensemble configures the multi-model detector.
Ensemble EnsembleConfig `yaml:"ensemble"`
// MAD configures the MAD detector.
MAD MADConfig `yaml:"mad"`
// COPOD configures the Copula-Based Outlier detector.
COPOD COPODConfig `yaml:"copod"`
// RRCF is used by the standalone detector and the AVG/MAX/MEDIAN ensemble paths.
RRCF RRCFConfig `yaml:"rrcf"`
// RRCFVariants configures the three-horizon RRCF instances for the SEAD ensemble.
// Defaults are applied automatically when fields are zero.
RRCFVariants RRCFVariantsConfig `yaml:"rrcf_variants"`
// AutoScaling configures dynamic detector switching.
AutoScaling AutoScalingConfig `yaml:"auto_scaling"`
}
// Config is the top-level pipeline configuration, decoded from YAML by
// LoadConfig. Call Compile afterwards to populate the compiled masking
// regular expressions.
type Config struct {
// Ingestion selects the data sources: the log file to tail, the network
// interface and disk device to sample, and the systemd units to poll.
Ingestion struct {
LogPath string `yaml:"log_path"`
NetInterface string `yaml:"net_interface"`
DiskDevice string `yaml:"disk_device"`
SystemctlServices []string `yaml:"systemctl_services"`
} `yaml:"ingestion"`
// Transformation holds the aggregation window size and a database path.
Transformation struct {
WindowSize time.Duration `yaml:"window_size"`
DbPath string `yaml:"db_path"`
} `yaml:"transformation"`
// Drain holds the Drain3 template-miner settings and the masking patterns
// applied to each log line before mining.
Drain struct {
Depth int `yaml:"depth"`
SimThreshold float64 `yaml:"sim_threshold"`
MaxChildren int `yaml:"max_children"`
MaxClusters int `yaml:"max_clusters"`
MaskingPatterns []MaskingPattern `yaml:"masking_patterns"`
} `yaml:"drain"`
// Detection holds all detector settings; note the YAML key is "detector".
Detection DetectorConfig `yaml:"detector"`
// Output holds the paths of the feature log and the anomaly log.
Output struct {
FeatureLogPath string `yaml:"feature_log_path"`
AnomalyLogPath string `yaml:"anomaly_log_path"`
} `yaml:"output"`
}
// LoadConfig opens the YAML file at path and decodes it into a new Config.
//
// Unknown YAML keys are tolerated (KnownFields is explicitly disabled), so a
// misspelled key is silently ignored. Masking patterns are not compiled
// here; call Config.Compile on the result.
//
// Errors from opening or decoding are wrapped with the file path.
func LoadConfig(path string) (*Config, error) {
	file, openErr := os.Open(path)
	if openErr != nil {
		return nil, fmt.Errorf("config: open %q: %w", path, openErr)
	}
	defer file.Close()

	decoder := yaml.NewDecoder(file)
	decoder.KnownFields(false)

	loaded := new(Config)
	if decodeErr := decoder.Decode(loaded); decodeErr != nil {
		return nil, fmt.Errorf("config: decode %q: %w", path, decodeErr)
	}
	return loaded, nil
}
// Compile compiles the Pattern of every entry in Drain.MaskingPatterns and
// stores the result in that entry's Re field.
//
// It stops at the first pattern that fails to compile and returns an error
// naming that pattern; entries before it keep their compiled expression.
func (c *Config) Compile() error {
	patterns := c.Drain.MaskingPatterns
	for idx := range patterns {
		compiled, err := regexp.Compile(patterns[idx].Pattern)
		if err != nil {
			return fmt.Errorf("config: compile pattern %q: %w", patterns[idx].Name, err)
		}
		// patterns shares its backing array with the config, so this write
		// updates the configuration in place.
		patterns[idx].Re = compiled
	}
	return nil
}
// NumericPatternNames returns, in configuration order, the names of all
// masking patterns whose Type is "float" or "int". Patterns with an empty
// Name are skipped. The result is never nil.
func (c *Config) NumericPatternNames() []string {
	patterns := c.Drain.MaskingPatterns
	numeric := make([]string, 0, len(patterns))
	for i := range patterns {
		p := &patterns[i]
		if p.Name == "" {
			continue
		}
		switch p.Type {
		case "float", "int":
			numeric = append(numeric, p.Name)
		}
	}
	return numeric
}

98
internal/detect/copod.go Normal file
View file

@ -0,0 +1,98 @@
// Package detect provides anomaly detection algorithms and ensemble logic.
package detect
import (
"fmt"
"log"
"codeberg.org/pata1704/copod"
"codeberg.org/pata1704/guenther/pkg/types"
)
// COPODDetector implements the AnomalyDetector interface by wrapping the
// external codeberg.org/pata1704/copod package.
//
// Streaming mode: Score also feeds the scored vector into the sliding-window
// buffer, so callers (like SEAD) only need to call Score per time step and
// do not need a separate Update call.
//
// Fit seeds the buffer with a batch of normal vectors. If Fit is not called
// the detector starts with an empty buffer.
// NOTE(review): the original comment states that a cold detector returns
// score=0 until the buffer has enough points; that behaviour lives in the
// wrapped library and should be confirmed there.
type COPODDetector struct {
// detector is the wrapped library detector, created by NewCOPODDetector.
detector *copod.Detector
}
// NewCOPODDetector creates the streaming COPOD detector wrapper.
//
//   - bufferSize: sliding-window capacity. Recommended: 100 to 200.
//   - threshold: score cutoff for standalone IsAnomaly. When used inside
//     SEAD the threshold is ignored (SEAD applies its own adaptive threshold).
//
// An error from the underlying library constructor is returned wrapped.
func NewCOPODDetector(bufferSize int, threshold float64) (*COPODDetector, error) {
	inner, err := copod.NewDetector(bufferSize, threshold)
	if err == nil {
		return &COPODDetector{detector: inner}, nil
	}
	return nil, fmt.Errorf("copod: initialize wrapped detector: %w", err)
}
// Fit feeds each vector, in order, into the COPOD sliding window.
// It stops at and returns the first update error; vectors inserted before the
// failure remain in the window.
func (c *COPODDetector) Fit(vectors []types.FeatureVector) error {
	for i := range vectors {
		if err := c.update(vectors[i]); err != nil {
			return err
		}
	}
	return nil
}
// Update adds a single observation to the sliding window of the wrapped
// detector.
// NOTE(review): concurrency safety with Score depends on the copod package;
// this wrapper adds no locking of its own — confirm.
func (c *COPODDetector) Update(vector types.FeatureVector) error {
	return c.detector.Update(copod.FeatureVector{
		NormalizedVector: vector.NormalizedVector,
		Timestamp:        vector.Timestamp,
	})
}
// Score returns the COPOD anomaly assessment for vector and then inserts the
// vector into the sliding window.
//
// The order is score-then-insert, so the scored point is not part of the
// window it is evaluated against. A failure to insert after a successful
// score is only logged: the already computed result is still returned.
// A scoring failure is wrapped and returned with a zero AnomalyResult.
func (c *COPODDetector) Score(vector types.FeatureVector) (types.AnomalyResult, error) {
	scored, err := c.detector.Score(copod.FeatureVector{
		NormalizedVector: vector.NormalizedVector,
		Timestamp:        vector.Timestamp,
	})
	if err != nil {
		return types.AnomalyResult{}, fmt.Errorf("copod: score: %w", err)
	}

	// Best effort: keep the window current, but never lose the score.
	if updErr := c.update(vector); updErr != nil {
		log.Printf("copod: update after score: %v", updErr)
	}

	var out types.AnomalyResult
	out.Timestamp = scored.Timestamp
	out.Score = scored.Score
	out.IsAnomaly = scored.IsAnomaly
	out.Confidence = scored.Confidence
	out.Method = scored.Method
	return out, nil
}
// update converts vector into the copod package's FeatureVector (copying the
// NormalizedVector slice header and the Timestamp) and hands it to the wrapped
// detector's Update.
func (c *COPODDetector) update(vector types.FeatureVector) error {
	var point copod.FeatureVector
	point.NormalizedVector = vector.NormalizedVector
	point.Timestamp = vector.Timestamp
	return c.detector.Update(point)
}

325
internal/detect/ensemble.go Normal file
View file

@ -0,0 +1,325 @@
// Package detect provides anomaly detection algorithms and ensemble logic.
package detect
import (
"fmt"
"math"
"sort"
"strings"
"sync"
"codeberg.org/pata1704/guenther/pkg/types"
)
// EnsembleMethod selects the score-aggregation strategy used by EnsembleDetector.
type EnsembleMethod string
const (
// EnsembleAVG combines normalised sub-scores by arithmetic mean. Score also
// falls back to this aggregation for any unrecognised method value.
EnsembleAVG EnsembleMethod = "avg"
// EnsembleMAX takes the maximum of the normalised sub-scores (aggressive).
EnsembleMAX EnsembleMethod = "max"
// EnsembleMEDIAN uses the median of normalised sub-scores (robust to outliers).
// NOTE(review): with exactly two sub-scores, verify how Score resolves the
// even-length median.
EnsembleMEDIAN EnsembleMethod = "median"
// EnsembleSEAD delegates to an embedded SEADDetector (adaptive MWU weights).
// This method is selected by setting detector.ensemble.method = "sead" in
// the config. The base detectors are built by NewSEADWithAllDetectors from
// the same COPOD/RRCF parameters handed to NewEnsembleDetector, and the
// SEAD wrapper handles the online weight updates.
// NOTE(review): comments in this file disagree on whether SEAD wraps four
// or six base detectors — confirm against NewSEADWithAllDetectors.
EnsembleSEAD EnsembleMethod = "sead"
)
// RRCFVariantConfig holds parameters for a single named RRCF instance in the
// SEAD multi-horizon ensemble. In the classic (non-SEAD) path,
// NewEnsembleDetector replaces zero-valued fields of the Mid variant with
// built-in defaults.
type RRCFVariantConfig struct {
// NumTrees controls score stability: more trees → smoother / more conservative.
NumTrees int
// TreeSize is the sliding-window capacity per tree.
TreeSize int
// ThresholdPercentile is the per-model decision threshold for standalone use,
// expressed as a fraction (the classic-path default is 0.85).
ThresholdPercentile float64
}
// RRCFVariantsConfig groups the three RRCF horizon variants used by the SEAD ensemble.
//   - Fast: short memory, reactive to transient spikes
//   - Mid: medium memory, balanced sensitivity; also the only variant used by
//     the classic AVG/MAX/MEDIAN ensemble path
//   - Slow: long memory, detects sustained / slow-drift events
type RRCFVariantsConfig struct {
Fast RRCFVariantConfig
Mid RRCFVariantConfig
Slow RRCFVariantConfig
}
// EnsembleDetector implements the AnomalyDetector interface by combining
// COPOD and RRCF scores using min-max normalisation, or by delegating to SEAD.
//
// Scoring strategy (AVG / MAX / MEDIAN methods):
//  1. Each model produces a raw score on its own scale.
//  2. Both scores are normalised to [0, 1] against the min/max of a rolling
//     history of that model's recent raw scores.
//  3. The combined score is the result of the selected aggregation function.
//  4. A window is flagged anomalous when combinedScore > threshold where
//     threshold = quantile(combinedHistory, 1-contamination), and only once
//     more than 10 combined scores have been recorded.
//
// SEAD method:
//
// When method == EnsembleSEAD the detector delegates entirely to an embedded
// SEADDetector, which adapts detector weights online via Multiplicative
// Weights Update (MWU/FTRL). In that mode NewEnsembleDetector does not create
// the copod and rrcf sub-detectors; both fields stay nil and must not be used.
type EnsembleDetector struct {
// method is fixed at construction and selects the code path in Fit/Update/Score.
method EnsembleMethod
// sub-detectors for AVG/MAX/MEDIAN methods (nil in SEAD mode)
copod AnomalyDetector
rrcf AnomalyDetector
// SEAD method: fully adaptive ensemble (replaces copod+rrcf when active;
// nil otherwise)
sead *SEADDetector
// contamination is the expected anomaly fraction used for the quantile threshold.
contamination float64
// mu guards the three history slices below.
mu sync.Mutex
copodHistory []float64
rrcfHistory []float64
combinedHistory []float64
// historySize caps each history slice; the oldest entry is evicted when full.
historySize int
}
// NewEnsembleDetector builds the multi-model ensemble for the given method.
//
//   - method: "avg" | "max" | "median" | "sead". Any value other than "sead"
//     takes the classic COPOD + RRCF path.
//   - copodBufferSize: sliding-window capacity for COPOD (≥ 100 recommended).
//   - copodThreshold: per-model threshold passed to COPODDetector.
//   - rrcfVariants: three-horizon RRCF config (fast/mid/slow). Passed whole to
//     SEAD; the classic path uses only the Mid variant.
//   - contamination: expected fraction of anomalies ∈ [0, 0.5). Not validated
//     here.
//   - seadCfg: SEAD parameters (only used when method == "sead").
//     Pass detect.DefaultSEADConfig() when method != "sead".
//
// The rolling score histories are capped at 1000 entries. Construction errors
// from the underlying detectors are wrapped with an "ensemble:" prefix.
func NewEnsembleDetector(
	method EnsembleMethod,
	copodBufferSize int, copodThreshold float64,
	rrcfVariants RRCFVariantsConfig,
	contamination float64,
	seadCfg SEADConfig,
) (*EnsembleDetector, error) {
	ens := new(EnsembleDetector)
	ens.method = method
	ens.contamination = contamination
	ens.historySize = 1000

	if method == EnsembleSEAD {
		// SEAD owns all of its base detectors; copod/rrcf stay nil.
		// The MAD detector is configured with threshold 3.5 and calibration
		// size 0. NOTE(review): 0 presumably selects the auto-calibration
		// default of 100 vectors — confirm in NewSEADWithAllDetectors.
		seadDet, err := NewSEADWithAllDetectors(
			copodBufferSize, copodThreshold,
			rrcfVariants,
			3.5, 0,
			seadCfg,
		)
		if err != nil {
			return nil, fmt.Errorf("ensemble: sead: %w", err)
		}
		ens.sead = seadDet
		return ens, nil
	}

	// Classic AVG/MAX/MEDIAN path: COPOD plus a single RRCF forest.
	copodDet, err := NewCOPODDetector(copodBufferSize, copodThreshold)
	if err != nil {
		return nil, fmt.Errorf("ensemble: %w", err)
	}
	ens.copod = copodDet

	// Work on a copy of the Mid variant and fill in defaults for unset fields.
	mid := rrcfVariants.Mid
	if mid.NumTrees == 0 {
		mid.NumTrees = 150
	}
	if mid.TreeSize == 0 {
		mid.TreeSize = 64
	}
	if mid.ThresholdPercentile == 0 {
		mid.ThresholdPercentile = 0.85
	}
	// Third argument (warmup) is 0.
	ens.rrcf = NewRRCFDetector(mid.NumTrees, mid.TreeSize, 0, mid.ThresholdPercentile)
	return ens, nil
}
// SEAD returns the embedded SEADDetector, or nil when the ensemble was not
// constructed in SEAD mode. The field is read under e.mu.
func (e *EnsembleDetector) SEAD() *SEADDetector {
	e.mu.Lock()
	inner := e.sead
	e.mu.Unlock()
	return inner
}
// Fit seeds the underlying models from a slice of feature vectors.
//
// In SEAD mode the call is forwarded unchanged to the SEAD detector.
// Otherwise COPOD is fitted first, then RRCF; the first failure is wrapped
// and returned, and RRCF is not fitted if COPOD fails.
func (e *EnsembleDetector) Fit(vectors []types.FeatureVector) error {
	if e.method != EnsembleSEAD {
		if err := e.copod.Fit(vectors); err != nil {
			return fmt.Errorf("ensemble: fit copod: %w", err)
		}
		if err := e.rrcf.Fit(vectors); err != nil {
			return fmt.Errorf("ensemble: fit rrcf: %w", err)
		}
		return nil
	}
	return e.sead.Fit(vectors)
}
// Update propagates the vector to the underlying models.
//
// In SEAD mode the call is forwarded unchanged to the SEAD detector.
// Otherwise COPOD is updated first, then RRCF; the first failure is wrapped
// and returned, and RRCF is not updated if COPOD fails.
func (e *EnsembleDetector) Update(vector types.FeatureVector) error {
	if e.method == EnsembleSEAD {
		return e.sead.Update(vector)
	}

	copodErr := e.copod.Update(vector)
	if copodErr != nil {
		return fmt.Errorf("ensemble: update copod: %w", copodErr)
	}

	rrcfErr := e.rrcf.Update(vector)
	if rrcfErr != nil {
		return fmt.Errorf("ensemble: update rrcf: %w", rrcfErr)
	}
	return nil
}
// Score evaluates the feature vector.
//
// For the SEAD method the call is delegated entirely to the embedded
// SEADDetector and its result is returned as-is.
//
// For AVG/MAX/MEDIAN:
//  1. COPOD and RRCF each score the vector (errors are wrapped and returned).
//  2. Each raw score is appended to its rolling history and min-max
//     normalised against that history (which already includes the new score).
//  3. The normalised scores are aggregated according to the method; any
//     unrecognised method falls back to the arithmetic mean.
//  4. The combined score is appended to its history and compared with
//     quantile(combinedHistory, 1-contamination). IsAnomaly is only set once
//     more than 10 combined scores have been recorded.
//
// Confidence is combined/threshold capped at 1.0. Method names the aggregation
// and which sub-detectors individually flagged the vector.
func (e *EnsembleDetector) Score(vector types.FeatureVector) (types.AnomalyResult, error) {
	if e.method == EnsembleSEAD {
		res, err := e.sead.Score(vector)
		if err != nil {
			return types.AnomalyResult{}, fmt.Errorf("ensemble: sead score: %w", err)
		}
		return res, nil
	}

	resCOPOD, err := e.copod.Score(vector)
	if err != nil {
		return types.AnomalyResult{}, fmt.Errorf("ensemble: score copod: %w", err)
	}
	resRRCF, err := e.rrcf.Score(vector)
	if err != nil {
		return types.AnomalyResult{}, fmt.Errorf("ensemble: score rrcf: %w", err)
	}

	// The histories are shared mutable state; hold the lock until return.
	e.mu.Lock()
	defer e.mu.Unlock()

	e.appendHistory(&e.copodHistory, resCOPOD.Score)
	e.appendHistory(&e.rrcfHistory, resRRCF.Score)
	normCOPOD := minMaxNorm(resCOPOD.Score, e.copodHistory)
	normRRCF := minMaxNorm(resRRCF.Score, e.rrcfHistory)

	var combined float64
	switch e.method {
	case EnsembleMAX:
		combined = math.Max(normCOPOD, normRRCF)
	case EnsembleMEDIAN:
		// True median: for an even count, average the two middle values.
		// Taking vals[len/2] alone would pick the upper value, which for two
		// sub-scores is the maximum and would make MEDIAN identical to MAX.
		// Written generically so it stays correct if more sub-scores are added.
		vals := []float64{normCOPOD, normRRCF}
		sort.Float64s(vals)
		mid := len(vals) / 2
		if len(vals)%2 == 0 {
			combined = (vals[mid-1] + vals[mid]) / 2.0
		} else {
			combined = vals[mid]
		}
	default: // EnsembleAVG
		combined = (normCOPOD + normRRCF) / 2.0
	}
	e.appendHistory(&e.combinedHistory, combined)

	// Require a minimum amount of history before trusting the quantile.
	const minDataPoints = 10
	threshold := quantile(e.combinedHistory, 1.0-e.contamination)
	isAnomaly := len(e.combinedHistory) > minDataPoints && combined > threshold

	return types.AnomalyResult{
		Timestamp: vector.Timestamp,
		Score:     combined,
		IsAnomaly: isAnomaly,
		// Guard the division against a zero threshold; cap at 1.0.
		Confidence: math.Min(combined/math.Max(threshold, 1e-9), 1.0),
		Method:     e.methodString(string(e.method), resCOPOD.IsAnomaly, resRRCF.IsAnomaly),
	}, nil
}
// WeightSummary returns the SEAD detector's own weight summary string.
// It returns "" when the ensemble is not in SEAD mode or has no SEAD detector.
func (e *EnsembleDetector) WeightSummary() string {
	if e.method == EnsembleSEAD && e.sead != nil {
		return e.sead.WeightSummary()
	}
	return ""
}
// appendHistory pushes v onto the end of *h and, if that makes the history
// longer than e.historySize, drops the oldest (first) entry.
// Caller must hold e.mu.
func (e *EnsembleDetector) appendHistory(h *[]float64, v float64) {
	grown := append(*h, v)
	if len(grown) > e.historySize {
		grown = grown[1:]
	}
	*h = grown
}
// methodString builds the label stored in AnomalyResult.Method: the
// upper-cased aggregation method followed, in parentheses, by the
// sub-detectors that individually flagged the vector ("COPOD", "RRCF",
// "COPOD+RRCF") or "none".
func (e *EnsembleDetector) methodString(method string, copodAnomaly, rrcfAnomaly bool) string {
	upper := strings.ToUpper(method)

	var flagged []string
	if copodAnomaly {
		flagged = append(flagged, "COPOD")
	}
	if rrcfAnomaly {
		flagged = append(flagged, "RRCF")
	}

	if len(flagged) == 0 {
		return fmt.Sprintf("Ensemble-%s(none)", upper)
	}
	return fmt.Sprintf("Ensemble-%s(%s)", upper, strings.Join(flagged, "+"))
}
// ── score helpers ─────────────────────────────────────────────────────────────
// minMaxNorm rescales v into [0, 1] relative to the smallest and largest
// values in history.
//
//   - Empty history yields 0.
//   - A history whose range is below 1e-12 (effectively constant) yields 0.5.
//   - Values outside the observed range are clamped to 0 or 1.
func minMaxNorm(v float64, history []float64) float64 {
	if len(history) == 0 {
		return 0
	}

	lo, hi := history[0], history[0]
	for i := 1; i < len(history); i++ {
		sample := history[i]
		if sample < lo {
			lo = sample
		}
		if sample > hi {
			hi = sample
		}
	}

	span := hi - lo
	if span < 1e-12 {
		// No usable range: report the neutral midpoint.
		return 0.5
	}

	switch scaled := (v - lo) / span; {
	case scaled < 0:
		return 0
	case scaled > 1:
		return 1
	default:
		return scaled
	}
}
// quantile returns the p-th quantile of data without modifying the slice.
//
// The result is the element at index floor(len(data) * p) of a sorted copy,
// with the index clamped to [0, len(data)-1]. Consequently p >= 1 yields the
// maximum and p <= 0 yields the minimum; an out-of-range p can never cause an
// out-of-bounds access. Empty data yields 0.
func quantile(data []float64, p float64) float64 {
	n := len(data)
	if n == 0 {
		return 0
	}

	// Sort a copy so the caller's ordering (a time-ordered history) is kept.
	sorted := make([]float64, n)
	copy(sorted, data)
	sort.Float64s(sorted)

	idx := int(float64(n) * p)
	if idx >= n {
		idx = n - 1
	}
	if idx < 0 {
		// Negative p (e.g. a contamination setting above 1) would otherwise
		// index before the start of the slice and panic.
		idx = 0
	}
	return sorted[idx]
}

200
internal/detect/iforest.go Normal file
View file

@ -0,0 +1,200 @@
package detect
import (
"log"
"sync"
"codeberg.org/pata1704/guenther/pkg/types"
"github.com/e-XpertSolutions/go-iforest/iforest"
)
// IsolationForestDetector wraps go-iforest with mutex-guarded access and
// continuous background retraining on non-anomalous data to handle concept drift.
//
// During the warmup phase (model == nil) incoming vectors are buffered by Score.
// Once warmupSize vectors have accumulated, the first training run executes
// synchronously on the calling goroutine, so that a model exists for
// subsequent Score calls.
//
// Subsequent retraining is asynchronous: when trainingBuffer reaches
// bufferSize the buffer is copied and cleared under the lock, and training
// runs in a detached goroutine. The current model remains active during
// retraining; the write lock is only taken to swap the model pointer.
type IsolationForestDetector struct {
// mu guards model and trainingBuffer.
mu sync.RWMutex
// model is the active forest; nil until the first Fit completes.
model *iforest.Forest
// trainingBuffer collects vectors for the next (re)training run.
trainingBuffer []types.FeatureVector
// Tuning knobs set via constructor; not modified afterwards.
numTrees int
subSample int
contamination float64
bufferSize int
warmupSize int
threshold float64
}
// NewIsolationForestDetector creates an untrained detector with the given
// parameters.
//
//   - bufferSize: number of buffered vectors that triggers a background
//     retraining run.
//   - warmupSize: number of vectors to accumulate before the first
//     (synchronous) training run. Values ≤ 0 or greater than bufferSize are
//     replaced by bufferSize.
//   - numTrees: number of isolation trees (typically 100).
//   - subSample: subsample size per tree (typically 256).
//   - contamination: expected fraction of anomalies (0 < c < 0.5).
//   - threshold: score cutoff for IsAnomaly.
//
// None of the parameters are validated beyond the warmupSize adjustment.
func NewIsolationForestDetector(
	bufferSize, warmupSize, numTrees, subSample int,
	contamination, threshold float64,
) *IsolationForestDetector {
	effectiveWarmup := warmupSize
	if !(effectiveWarmup > 0 && effectiveWarmup <= bufferSize) {
		effectiveWarmup = bufferSize
	}

	det := new(IsolationForestDetector)
	det.bufferSize = bufferSize
	det.warmupSize = effectiveWarmup
	det.numTrees = numTrees
	det.subSample = subSample
	det.contamination = contamination
	det.threshold = threshold
	return det
}
// Fit trains a brand-new Isolation Forest on vectors and installs it as the
// active model. An empty input is a no-op that leaves any existing model in
// place.
//
// Training happens outside the lock; the write lock is held only while the
// model pointer is swapped, so Fit can run alongside Score. Always returns nil.
func (d *IsolationForestDetector) Fit(vectors []types.FeatureVector) error {
	sampleCount := len(vectors)
	if sampleCount == 0 {
		return nil
	}

	// Row matrix view over the normalised vectors (slices are shared, not copied).
	matrix := make([][]float64, sampleCount)
	for i := range vectors {
		matrix[i] = vectors[i].NormalizedVector
	}

	trained := iforest.NewForest(d.numTrees, d.subSample, d.contamination)
	trained.Train(matrix)
	// NOTE(review): Test on the training data presumably establishes the
	// forest's internal anomaly bound needed by Predict — confirm in go-iforest.
	trained.Test(matrix)

	d.mu.Lock()
	d.model = trained
	d.mu.Unlock()

	log.Printf("iforest: trained on %d samples (trees=%d, subsample=%d, contamination=%.3f)",
		sampleCount, d.numTrees, d.subSample, d.contamination)
	return nil
}
// Score returns an AnomalyResult for vector.
//
// Pre-model (warmup) behaviour:
//   - Vector is appended to trainingBuffer.
//   - Once warmupSize is reached the first training run executes synchronously
//     on the calling goroutine so subsequent Score calls have a model. The
//     vector that completed the warmup is then scored by the new model.
//   - Returns score=0, IsAnomaly=false while warming up.
//
// Post-model behaviour:
//   - Score is computed via the model pointer read under the read lock.
//   - Non-anomalous vectors are appended to trainingBuffer (via Update).
//   - When trainingBuffer reaches bufferSize, a background retrain fires.
//
// Errors from Fit or Predict are returned together with the zero-score
// warmup result.
func (d *IsolationForestDetector) Score(vector types.FeatureVector) (types.AnomalyResult, error) {
// Neutral result returned while no model exists or when inference fails.
warmup := types.AnomalyResult{
Timestamp: vector.Timestamp,
Score: 0,
IsAnomaly: false,
Method: "IF",
}
// ── warmup phase ──────────────────────────────────────────────────────
// Snapshot the model pointer; a concurrent retrain may swap it afterwards,
// in which case this call simply scores against the previous model.
d.mu.RLock()
model := d.model
d.mu.RUnlock()
if model == nil {
d.mu.Lock()
d.trainingBuffer = append(d.trainingBuffer, vector)
bufLen := len(d.trainingBuffer)
d.mu.Unlock()
if bufLen < d.warmupSize {
return warmup, nil
}
// Synchronous first fit to eliminate the cold-start gap.
// Ownership of the buffer moves to this call; the shared buffer is reset.
d.mu.Lock()
buf := d.trainingBuffer
d.trainingBuffer = nil
d.mu.Unlock()
if err := d.Fit(buf); err != nil {
return warmup, err
}
d.mu.RLock()
model = d.model
d.mu.RUnlock()
if model == nil {
// Defensive: Fit installs no model when buf is empty (e.g. another
// goroutine claimed the buffer first).
return warmup, nil
}
}
// ── inference ─────────────────────────────────────────────────────────
// Predict takes a batch; a single-row batch is scored here and only the
// per-row scores are used.
_, scores, err := model.Predict([][]float64{vector.NormalizedVector})
if err != nil {
return warmup, err
}
if len(scores) == 0 {
return warmup, nil
}
score := scores[0]
res := types.AnomalyResult{
Timestamp: vector.Timestamp,
Score: score,
IsAnomaly: score > d.threshold,
// The raw score doubles as the confidence value.
Confidence: score,
Method: "IF",
}
// Buffer non-anomalous vectors for background retraining.
// A buffering failure is logged only; the score is still returned.
if !res.IsAnomaly {
if err := d.Update(vector); err != nil {
log.Printf("iforest: update buffer: %v", err)
}
}
return res, nil
}
// Update appends vector to the training buffer.
//
// When the buffer has reached bufferSize it is copied and cleared under the
// lock, and a detached goroutine retrains the model on the copy; a retraining
// error is only logged. Update itself always returns nil and does not wait
// for the retraining to finish.
func (d *IsolationForestDetector) Update(vector types.FeatureVector) error {
	var snapshot []types.FeatureVector
	retrain := false

	d.mu.Lock()
	d.trainingBuffer = append(d.trainingBuffer, vector)
	if len(d.trainingBuffer) >= d.bufferSize {
		// Detach a private copy so the goroutine never shares the live buffer.
		snapshot = make([]types.FeatureVector, len(d.trainingBuffer))
		copy(snapshot, d.trainingBuffer)
		d.trainingBuffer = nil
		retrain = true
	}
	d.mu.Unlock()

	if !retrain {
		return nil
	}

	go func() {
		if err := d.Fit(snapshot); err != nil {
			log.Printf("iforest: background retrain: %v", err)
		}
	}()
	return nil
}
// ── helpers ───────────────────────────────────────────────────────────────────
// convertToMatrix collects the NormalizedVector of every feature vector into
// a row matrix. Rows alias the original slices; nothing is copied.
func convertToMatrix(vectors []types.FeatureVector) [][]float64 {
	rows := make([][]float64, 0, len(vectors))
	for i := range vectors {
		rows = append(rows, vectors[i].NormalizedVector)
	}
	return rows
}

View file

@ -0,0 +1,148 @@
package detect
import (
"context"
"log"
"sync"
"time"
"codeberg.org/pata1704/guenther/pkg/types"
)
// AnomalyDetector is the common interface for all detection algorithms.
// Implementations must be safe for concurrent use.
type AnomalyDetector interface {
// Fit trains (or seeds) the model from the supplied slice of vectors, which
// are expected to represent normal behaviour.
Fit(vectors []types.FeatureVector) error
// Score returns an anomaly assessment for vector. It must not block.
// Several implementations in this package also insert the scored vector
// into their own model state as part of Score.
Score(vector types.FeatureVector) (types.AnomalyResult, error)
// Update buffers vector for incremental model updates.
Update(vector types.FeatureVector) error
}
// DetectionLayer reads FeatureVectors from inputChan, scores them with the
// configured AnomalyDetector, and forwards AnomalyResults to outputChan.
//
// The layer runs a single event-loop goroutine (no additional worker pool is
// needed because detection is CPU-bound in a single model, not I/O-bound).
// Health metrics are emitted to healthChan every 5 seconds.
//
// Backpressure: if outputChan is full the result is dropped, counted, and a
// warning is logged. The detection goroutine therefore never blocks on a slow
// consumer, which keeps it draining inputChan for the upstream stage.
type DetectionLayer struct {
detector AnomalyDetector
inputChan <-chan types.FeatureVector
outputChan chan<- types.AnomalyResult
healthChan chan<- types.StageHealth
// scalingController is read by the event loop without locking.
scalingController *ScalingController // optional
// wg tracks the event-loop goroutine started by Start.
wg sync.WaitGroup
// mu guards the counters and latency average below.
mu sync.Mutex
processed uint64
dropped uint64
// avgLatency is an exponentially weighted moving average in milliseconds.
avgLatency float64
}
// NewDetectionLayer wires a detector to its input, output and health
// channels. No goroutine is started and no scaling controller is attached;
// use Start and SetScalingController for that.
func NewDetectionLayer(
	detector AnomalyDetector,
	input <-chan types.FeatureVector,
	output chan<- types.AnomalyResult,
	health chan<- types.StageHealth,
) *DetectionLayer {
	layer := new(DetectionLayer)
	layer.detector = detector
	layer.inputChan = input
	layer.outputChan = output
	layer.healthChan = health
	return layer
}
// SetScalingController attaches an auto-scaling controller to the layer.
// Once attached, every handled vector's AvgCPUPercent is reported to it.
// The field is written here and read by the event loop without any
// synchronisation, so call this before Start.
func (l *DetectionLayer) SetScalingController(sc *ScalingController) {
l.scalingController = sc
}
// Start launches the detection event loop in a background goroutine tracked
// by the layer's WaitGroup.
//
// The loop scores every vector received on inputChan, emits a health snapshot
// every 5 seconds, and exits when either ctx is cancelled or inputChan is
// closed. Use Wait to block until it has exited.
//
// Start is NOT idempotent: each call launches another event loop reading from
// the same channels. Call it once per layer.
func (l *DetectionLayer) Start(ctx context.Context) {
	l.wg.Go(func() {
		reportTicker := time.NewTicker(5 * time.Second)
		defer reportTicker.Stop()
		for {
			select {
			case fv, ok := <-l.inputChan:
				if !ok {
					// A closed channel is always ready and yields zero values;
					// without this check the loop would spin, scoring empty
					// vectors until ctx is cancelled.
					return
				}
				l.handle(fv)
			case <-reportTicker.C:
				l.emitHealth()
			case <-ctx.Done():
				return
			}
		}
	})
}
// Wait blocks until every event loop launched by Start has returned, which
// happens after the context passed to Start is cancelled.
func (l *DetectionLayer) Wait() {
l.wg.Wait()
}
// handle processes one feature vector end to end:
//
//  1. Reports the vector's AvgCPUPercent to the scaling controller, if any.
//  2. Scores the vector and measures the wall-clock scoring time.
//  3. Updates the processed counter and the latency moving average
//     (80% previous value, 20% new sample; the first sample seeds it).
//     This happens even when scoring failed.
//  4. On a scoring error, logs it and stops.
//  5. Otherwise offers the result to outputChan without blocking; if the
//     channel is full the result is dropped, counted and logged.
func (l *DetectionLayer) handle(fv types.FeatureVector) {
	if sc := l.scalingController; sc != nil {
		sc.ObserveCPU(fv.AvgCPUPercent)
	}

	began := time.Now()
	result, scoreErr := l.detector.Score(fv)
	elapsedMs := time.Since(began).Seconds() * 1e3

	l.mu.Lock()
	l.processed++
	switch {
	case l.avgLatency == 0:
		l.avgLatency = elapsedMs
	default:
		l.avgLatency = l.avgLatency*0.8 + elapsedMs*0.2
	}
	l.mu.Unlock()

	if scoreErr != nil {
		log.Printf("detection: score error: %v", scoreErr)
		return
	}

	select {
	case l.outputChan <- result:
		return
	default:
	}

	// Consumer is not keeping up: drop instead of stalling the event loop.
	l.mu.Lock()
	l.dropped++
	l.mu.Unlock()
	log.Printf("detection: output channel full dropping result (score=%.4f)", result.Score)
}
// emitHealth publishes a StageHealth snapshot (processed and dropped counts,
// average latency, current time) for the "detection_layer" stage.
// The counters are read under the lock; the send is non-blocking, so the
// report is silently skipped when healthChan is full.
func (l *DetectionLayer) emitHealth() {
	var snapshot types.StageHealth
	snapshot.StageName = "detection_layer"

	l.mu.Lock()
	snapshot.EventsProcessed = l.processed
	snapshot.EventsDropped = l.dropped
	snapshot.AvgLatencyMs = l.avgLatency
	l.mu.Unlock()

	snapshot.LastUpdate = time.Now()

	select {
	case l.healthChan <- snapshot:
	default:
	}
}

254
internal/detect/mad.go Normal file
View file

@ -0,0 +1,254 @@
// Package detect provides anomaly detection algorithms and ensemble logic.
package detect
import (
"log"
"math"
"sort"
"sync"
"codeberg.org/pata1704/guenther/pkg/types"
)
// MADDetector scores feature vectors using per-feature Median Absolute
// Deviation (MAD) with pre-calibrated or automatically derived statistics.
//
// # Auto-calibration
//
// A detector created with NewMADDetectorAutoCalibrate buffers the first
// calibrationSize non-empty NormalizedVectors it sees in Score and computes
// per-feature statistics once the buffer is full. Until then Score does NOT
// return zero: it scores each vector against a fixed identity prior (see
// scoreIdentity), labels the result "MAD (warmup)", and may already flag
// anomalies.
//
//	detector := NewMADDetectorAutoCalibrate(3.5, 100)
//
// # Calibration contract
//
// The medians and mads slices must be computed from the SAME representation
// that arrives in vector.NormalizedVector, i.e. from the RobustScaler-scaled
// feature vectors, NOT from raw window aggregates.
//
// # Scoring
//
// For each feature i the modified Z-score is:
//
//	score_i = |x_i - median_i| / (1.4826 * max(MAD_i, 0.01))
//
// The constant 1.4826 ≈ 1/(Φ⁻¹(3/4)) makes MAD a consistent estimator of σ
// under normality (Rousseeuw & Croux, 1993). The 0.01 floor prevents
// near-constant features from producing explosive scores. The anomaly score
// is the maximum modified Z-score across all features.
//
// # Fit / Update
//
// Fit replaces the current statistics with values derived from the supplied
// vectors and ends any calibration in progress. Update forwards to Score and
// discards the result, so it only has an effect while auto-calibration is
// still collecting vectors.
type MADDetector struct {
// mu guards medians, mads and the auto-calibration state.
mu sync.Mutex
// threshold is the modified Z-score cutoff; set once at construction.
threshold float64
medians []float64 // per-feature median of NormalizedVector in baseline
mads []float64 // per-feature MAD of NormalizedVector in baseline
// Auto-calibration state. calibrationSize == 0 means disabled.
calibrationSize int
calibrationBuf [][]float64 // collected NormalizedVectors during warmup
// calibrated is true once statistics are available (supplied, fitted or
// auto-derived).
calibrated bool
}
// NewMADDetector creates a MADDetector with pre-calibrated baseline statistics.
//
//   - threshold: anomaly score cutoff (modified Z-score). Typical: 2.5–4.0.
//   - medians: per-feature median computed from NormalizedVector in baseline.
//   - mads: per-feature MAD computed from NormalizedVector in baseline.
//     Entries are used as given; tiny or zero values are floored to 0.01
//     at scoring time rather than replaced here.
//
// The slices are stored by reference, not copied. The detector counts as
// calibrated only when medians is non-empty; with nil/empty statistics and no
// auto-calibration every score is zero (use NewMADDetectorAutoCalibrate
// instead).
func NewMADDetector(threshold float64, medians, mads []float64) *MADDetector {
	det := new(MADDetector)
	det.threshold = threshold
	det.medians = medians
	det.mads = mads
	det.calibrated = len(medians) > 0
	return det
}
// NewMADDetectorAutoCalibrate creates a MADDetector that derives its own
// per-feature statistics from the first calibrationSize NormalizedVectors
// it encounters in Score.
//
//   - threshold: modified Z-score cutoff. Typical: 3.5.
//   - calibrationSize: number of vectors to buffer before calibration.
//     Recommended: 60–200. Values ≤ 0 select the default of 100.
//
// No statistics are stored up front (medians and mads start nil). Until
// calibration completes, Score falls back to a fixed identity prior, so the
// detector produces usable scores immediately.
func NewMADDetectorAutoCalibrate(threshold float64, calibrationSize int) *MADDetector {
	size := calibrationSize
	if size <= 0 {
		size = 100
	}

	det := &MADDetector{}
	det.threshold = threshold
	det.calibrationSize = size
	return det
}
// Fit recomputes per-feature median and MAD from the supplied vectors,
// replacing any prior calibration and discarding a partially filled
// auto-calibration buffer. An empty input is a no-op.
//
// The statistics are computed outside the lock and swapped in under it, so
// Fit may run concurrently with Score. Always returns nil.
func (m *MADDetector) Fit(vectors []types.FeatureVector) error {
	if len(vectors) == 0 {
		return nil
	}

	rows := make([][]float64, 0, len(vectors))
	for i := range vectors {
		rows = append(rows, vectors[i].NormalizedVector)
	}
	newMedians, newMADs := computeMADStats(rows)

	m.mu.Lock()
	m.medians, m.mads = newMedians, newMADs
	m.calibrated, m.calibrationBuf = true, nil
	m.mu.Unlock()

	log.Printf("mad: fitted on %d vectors (%d features)", len(vectors), len(newMedians))
	return nil
}
// Update forwards v to Score and discards both the result and the error.
// While auto-calibration is still collecting vectors this adds v to the
// calibration buffer; once the detector is calibrated, Score changes no
// state, so Update is effectively a no-op. Always returns nil.
func (m *MADDetector) Update(v types.FeatureVector) error {
_, _ = m.Score(v)
return nil
}
// Score computes the maximum modified Z-score across all features of vector.
//
// Auto-calibration warmup: while the detector is uncalibrated and
// auto-calibration is enabled, a copy of each non-empty NormalizedVector is
// buffered and the vector is scored against the identity prior (result
// labelled "MAD (warmup)"). The vector that fills the buffer triggers
// calibration and is itself scored with the freshly derived statistics.
//
// Regular scoring: features are compared index-by-index with medians/mads;
// features beyond the shorter of the three slices are ignored. Each MAD is
// floored at 0.01 before use.
//
// vector.NormalizedVector must contain values on the same scale as the
// medians and mads slices (i.e. RobustScaler-scaled values from DuckDB).
// The returned error is always nil.
func (m *MADDetector) Score(vector types.FeatureVector) (types.AnomalyResult, error) {
	m.mu.Lock()
	if !m.calibrated && m.calibrationSize > 0 {
		// Buffer a private copy so later mutation by the caller cannot
		// corrupt the calibration data.
		if src := vector.NormalizedVector; len(src) > 0 {
			dup := make([]float64, len(src))
			copy(dup, src)
			m.calibrationBuf = append(m.calibrationBuf, dup)
		}
		if len(m.calibrationBuf) < m.calibrationSize {
			m.mu.Unlock()
			return m.scoreIdentity(vector), nil
		}
		// Buffer full: derive statistics and fall through to regular scoring.
		m.medians, m.mads = computeMADStats(m.calibrationBuf)
		m.calibrated = true
		m.calibrationBuf = nil
		log.Printf("mad: auto-calibrated on %d vectors (%d features)",
			m.calibrationSize, len(m.medians))
	}
	// Snapshot the slice headers so scoring can proceed without the lock.
	medians, mads := m.medians, m.mads
	m.mu.Unlock()

	// Only indices present in all three slices can be scored.
	limit := len(vector.NormalizedVector)
	if len(medians) < limit {
		limit = len(medians)
	}
	if len(mads) < limit {
		limit = len(mads)
	}

	worst := 0.0
	for i := 0; i < limit; i++ {
		// Stability floor: prevents explosive Z-scores for features with
		// near-zero variance.
		spread := math.Max(mads[i], 0.01)
		// 1.4826 converts MAD to an estimator of standard deviation.
		z := math.Abs(vector.NormalizedVector[i]-medians[i]) / (1.4826 * spread)
		if z > worst {
			worst = z
		}
	}

	var out types.AnomalyResult
	out.Timestamp = vector.Timestamp
	out.Score = worst
	out.IsAnomaly = worst > m.threshold
	out.Confidence = math.Min(worst/math.Max(m.threshold, 1e-9), 1.0)
	out.Method = "MAD"
	return out, nil
}
// scoreIdentity is the warmup fallback for pre-scaled data: it scores vector
// without any learned statistics, taking the largest |x_i| / 0.6745 over all
// features. Results are labelled "MAD (warmup)"; flagged results also carry
// an explanatory Details string.
//
// NOTE(review): dividing by 0.6745 equals multiplying by 1.4826, whereas the
// regular formula with median=0, MAD=1 would divide by 1.4826. The existing
// tests pin the current values — confirm which scale is intended.
func (m *MADDetector) scoreIdentity(vector types.FeatureVector) types.AnomalyResult {
	peak := 0.0
	for i := range vector.NormalizedVector {
		if s := math.Abs(vector.NormalizedVector[i]) / 0.6745; s > peak {
			peak = s
		}
	}

	flagged := peak > m.threshold
	out := types.AnomalyResult{
		Timestamp:  vector.Timestamp,
		Score:      peak,
		IsAnomaly:  flagged,
		Confidence: math.Min(peak/math.Max(m.threshold, 1e-9), 1.0),
		Method:     "MAD (warmup)",
	}
	if flagged {
		out.Details = "Detected during MAD auto-calibration warmup period (using identity prior)."
	}
	return out
}
// ── calibration helper ────────────────────────────────────────────────────────
// computeMADStats returns per-feature median and MAD for a matrix of row vectors.
// Both slices have length equal to the number of features (columns).
func computeMADStats(rows [][]float64) (medians, mads []float64) {
if len(rows) == 0 {
return nil, nil
}
nFeatures := len(rows[0])
medians = make([]float64, nFeatures)
mads = make([]float64, nFeatures)
col := make([]float64, len(rows))
devs := make([]float64, len(rows))
for f := range nFeatures {
for r, row := range rows {
if f < len(row) {
col[r] = row[f]
}
}
med := median(col)
medians[f] = med
for r, v := range col {
devs[r] = math.Abs(v - med)
}
mads[f] = median(devs)
}
return medians, mads
}
// median returns the median of xs. xs is modified in-place (sorted).
func median(xs []float64) float64 {
n := len(xs)
if n == 0 {
return 0
}
sort.Float64s(xs)
if n%2 == 1 {
return xs[n/2]
}
return (xs[n/2-1] + xs[n/2]) / 2.0
}

114
internal/detect/mad_test.go Normal file
View file

@ -0,0 +1,114 @@
package detect
import (
"testing"
"time"
"codeberg.org/pata1704/guenther/pkg/types"
"github.com/stretchr/testify/assert"
)
// TestMADDetector_Score checks a pre-calibrated single-feature detector
// (median 10, MAD 1, threshold 3): a value close to the median is not
// flagged, while an extreme value is flagged with a score above the threshold.
func TestMADDetector_Score(t *testing.T) {
	det := NewMADDetector(3.0, []float64{10.0}, []float64{1.0})

	// scoreValue scores a one-feature vector and asserts that no error occurred.
	scoreValue := func(value float64) types.AnomalyResult {
		t.Helper()
		out, err := det.Score(types.FeatureVector{
			Timestamp:        time.Now(),
			NormalizedVector: []float64{value},
		})
		assert.NoError(t, err)
		return out
	}

	inlier := scoreValue(11)
	assert.False(t, inlier.IsAnomaly, "Value 11 should not be an anomaly")

	outlier := scoreValue(100)
	assert.True(t, outlier.IsAnomaly, "Value 100 should be an anomaly")
	assert.Greater(t, outlier.Score, 3.0)
}
// TestMADDetector_CalibrationStability verifies that a detector calibrated on
// perfectly constant data (learned MAD = 0) still yields bounded scores,
// thanks to the 0.01 MAD floor applied in Score. The exact number and order
// of Score calls matters: calibration triggers on the 100th vector.
func TestMADDetector_CalibrationStability(t *testing.T) {
// 1. Create a detector that auto-calibrates on 100 idle vectors.
detector := NewMADDetectorAutoCalibrate(3.5, 100)
now := time.Now()
// 2. Feed 99 perfectly idle vectors.
// They all use the identity-prior fallback; since every value is 0 the
// score is 0 and the method label contains "warmup".
for i := 0; i < 99; i++ {
fv := types.FeatureVector{
Timestamp: now.Add(time.Duration(i) * time.Second),
NormalizedVector: []float64{0.0, 0.0},
}
res, err := detector.Score(fv)
assert.NoError(t, err)
assert.Equal(t, 0.0, res.Score)
assert.Contains(t, res.Method, "warmup")
}
// 3. Feed the 100th vector. This triggers calibration.
// Since all 100 vectors were 0, the learned medians will be 0 and mads will be 0.
fv100 := types.FeatureVector{
Timestamp: now.Add(100 * time.Second),
NormalizedVector: []float64{0.0, 0.0},
}
res100, err := detector.Score(fv100)
assert.NoError(t, err)
assert.Equal(t, 0.0, res100.Score)
// After this call, mads should be [0.0, 0.0] but floored to 0.01 during Score.
// 4. Feed the 101st vector: a "normal" burst (e.g. 1.0 baseline IQR).
// Without the floor, this would be 1.0 / (1.4826 * 0), i.e. a division by zero.
// With the floor (0.01), it should be 1.0 / (1.4826 * 0.01) ≈ 67.45.
fv101 := types.FeatureVector{
Timestamp: now.Add(101 * time.Second),
NormalizedVector: []float64{1.0, 0.0},
}
res101, err := detector.Score(fv101)
assert.NoError(t, err)
// Check that the score is bounded by the floor.
// 1.0 / (1.4826 * 0.01) = 67.449
assert.InDelta(t, 67.449, res101.Score, 0.1)
assert.True(t, res101.IsAnomaly)
assert.Equal(t, "MAD", res101.Method) // No longer "warmup"
// 5. Test with a very small variance but not 0.
// Suppose the learned MAD was 0.0001. Unfloored, val=1.0 would score
// 1.0 / 0.000148... ≈ 6745. The floor (0.01) still limits this to 67.45.
// The statistics are overwritten directly, under the detector's own lock.
detector.mu.Lock()
detector.mads = []float64{0.0001, 0.0}
detector.medians = []float64{0.0, 0.0}
detector.mu.Unlock()
resSmall, err := detector.Score(fv101)
assert.NoError(t, err)
assert.InDelta(t, 67.449, resSmall.Score, 0.1)
}
// TestMADDetector_IdentityPrior pins the warmup scoring of an auto-calibrating
// detector that has not yet collected its 10 calibration vectors. During
// warmup the score is |x| / 0.6745 (see scoreIdentity):
//
//	x = 2.0 → ≈ 2.965, below the 3.5 threshold
//	x = 3.0 → ≈ 4.45, above the threshold, with an explanatory Details text
func TestMADDetector_IdentityPrior(t *testing.T) {
	det := NewMADDetectorAutoCalibrate(3.5, 10)

	mild, _ := det.Score(types.FeatureVector{
		NormalizedVector: []float64{2.0},
	})
	assert.InDelta(t, 2.965, mild.Score, 0.1)
	assert.False(t, mild.IsAnomaly) // 2.96 < 3.5

	strong, _ := det.Score(types.FeatureVector{
		NormalizedVector: []float64{3.0},
	})
	assert.InDelta(t, 4.44, strong.Score, 0.1)
	assert.True(t, strong.IsAnomaly)
	assert.Contains(t, strong.Details, "identity prior")
}

173
internal/detect/rrcf.go Normal file
View file

@ -0,0 +1,173 @@
// Package detect provides anomaly detection algorithms and ensemble logic.
package detect
import (
"fmt"
"log"
"math"
"sync"
"codeberg.org/pata1704/guenther/pkg/types"
"codeberg.org/pata1704/rrcf"
)
// RRCFDetector adapts an rrcf.Forest to the AnomalyDetector interface.
//
// Scoring strategy: score-then-insert (online streaming).
// Each call to Score, once any warmup buffering is over:
//  1. Scores the point via forest.Score. Per the original author's note this
//     uses an ephemeral key and does not leave the point in the forest.
//     NOTE(review): confirm against the rrcf package.
//  2. Inserts the point permanently via forest.Insert so the forest stays fresh.
//
// Fit and Score both take mu before touching any field.
type RRCFDetector struct {
mu sync.Mutex // serialises Fit and Score
forest *rrcf.Forest // nil until Fit or the first Score call creates it
thresholdPct float64 // quantile of scoreWindow used as the anomaly threshold
numTrees int // forwarded to rrcf.NewForest
treeSize int // forwarded to rrcf.NewForest
warmup int // vectors to buffer before scoring starts; 0 means score immediately
counter int // key passed to forest.Insert; advanced as points are inserted
buf []types.FeatureVector // vectors collected during warmup; flushed into the forest when full
// Rolling score window for adaptive threshold calculation.
// Uses a FIFO ring buffer; only scores after warmupDiscard are included.
scoreWindow *ringBuffer
warmupDiscard int // number of initial scores that are returned but kept out of scoreWindow
scored int // total scores computed by Score (including discarded ones)
}
// NewRRCFDetector builds an RRCFDetector. The forest itself is created lazily
// (by Fit or by the first Score call), so only configuration is stored here.
//
//   - numTrees: number of trees in the forest (200 recommended).
//   - treeSize: sliding-window capacity per tree (256 recommended).
//   - warmup: number of vectors to buffer before the first real score
//     (pass 0 to start scoring immediately).
//   - thresholdPct: quantile of the rolling score window used as threshold,
//     e.g. 0.65 flags a point when its score exceeds the 65th percentile of
//     recent scores.
//
// Fixed internal settings:
//   - the first 10 scores are not added to the rolling window;
//   - the rolling window keeps at most 60 scores.
func NewRRCFDetector(numTrees, treeSize, warmup int, thresholdPct float64) *RRCFDetector {
	const (
		scoreWindowMax   = 60
		discardedScores  = 10
	)

	det := new(RRCFDetector)
	det.numTrees = numTrees
	det.treeSize = treeSize
	det.warmup = warmup
	det.thresholdPct = thresholdPct
	det.scoreWindow = newRingBuffer(scoreWindowMax)
	det.warmupDiscard = discardedScores
	return det
}
// Fit seeds the forest from a slice of FeatureVectors.
//
// It replaces any existing forest and resets the insert counter. The forest
// dimension is taken from the first vector. An empty slice is a no-op.
//
// Vectors whose insertion fails are logged and skipped; Fit itself always
// returns nil. The rolling score window and the warmup buffer are left
// untouched.
func (d *RRCFDetector) Fit(vectors []types.FeatureVector) error {
	if len(vectors) == 0 {
		return nil
	}
	dim := len(vectors[0].NormalizedVector)

	d.mu.Lock()
	defer d.mu.Unlock()

	d.forest = rrcf.NewForest(d.numTrees, dim, d.treeSize)
	d.counter = 0
	for _, v := range vectors {
		if err := d.forest.Insert(v.NormalizedVector, d.counter); err != nil {
			log.Printf("rrcf: fit insert: %v", err)
			continue
		}
		d.counter++
	}

	// d.counter only advances on a successful insert, so it is the number of
	// points actually in the forest. Reporting len(vectors) here would
	// overstate the seed size whenever inserts were skipped above.
	log.Printf("rrcf: forest seeded with %d points (trees=%d, treeSize=%d)",
		d.counter, d.numTrees, d.treeSize)
	return nil
}
// Score returns an AnomalyResult for vector and inserts the vector into the
// forest (score-then-insert).
//
// Phases:
//  1. Warmup (only when warmup > 0): vectors are buffered and a zero-score,
//     non-anomalous result is returned. When the buffer reaches warmup
//     entries it is flushed into the forest and warmup ends for good.
//  2. Settling: the first warmupDiscard real scores are returned but kept out
//     of the rolling window and never flagged.
//  3. Steady state: the score is pushed into the rolling window; once the
//     window holds at least 10 scores, the point is flagged when its score
//     exceeds the thresholdPct-quantile of the window.
//
// An error is returned only when forest.Score fails; insert failures are
// logged and otherwise ignored.
func (d *RRCFDetector) Score(vector types.FeatureVector) (types.AnomalyResult, error) {
	d.mu.Lock()
	defer d.mu.Unlock()

	// Lazy forest initialisation on the first Score call (when Fit was not used).
	if d.forest == nil {
		dim := len(vector.NormalizedVector)
		d.forest = rrcf.NewForest(d.numTrees, dim, d.treeSize)
	}

	// Warmup buffering.
	if d.warmup > 0 && len(d.buf) < d.warmup {
		d.buf = append(d.buf, vector)
		if len(d.buf) == d.warmup {
			for _, v := range d.buf {
				// Log rather than swallow: matches the handling of insert
				// errors in Fit and in the streaming path below.
				if err := d.forest.Insert(v.NormalizedVector, d.counter); err != nil {
					log.Printf("rrcf: warmup insert: %v", err)
				}
				d.counter++
			}
			d.buf = nil
			log.Printf("rrcf: warmup complete (%d vectors)", d.warmup)
			// Mark warmup as finished. Without this, the emptied buffer makes
			// the condition above true again on the next call and the
			// detector would keep re-buffering forever, never scoring.
			d.warmup = 0
		}
		return types.AnomalyResult{
			Timestamp: vector.Timestamp,
			Score:     0,
			IsAnomaly: false,
			Method:    "RRCF",
		}, nil
	}

	// Score the point before it becomes part of the forest.
	score, err := d.forest.Score(vector.NormalizedVector)
	if err != nil {
		return types.AnomalyResult{}, fmt.Errorf("rrcf: %w", err)
	}

	// Permanent streaming insert to keep the forest fresh.
	if err := d.forest.Insert(vector.NormalizedVector, d.counter); err != nil {
		log.Printf("rrcf: insert: %v", err)
	}
	d.counter++
	d.scored++

	// Discard the first warmupDiscard scores: the forest is still settling
	// and scores are artificially high, which would anchor the threshold.
	if d.scored <= d.warmupDiscard {
		return types.AnomalyResult{
			Timestamp: vector.Timestamp,
			Score:     score,
			IsAnomaly: false,
			Method:    "RRCF",
		}, nil
	}

	// Update rolling score window (ring buffer). The current score is part of
	// the window the threshold is derived from.
	d.scoreWindow.push(score)

	// Need at least 10 scores before making decisions; until then threshold
	// stays 0 and the point is never flagged.
	isAnomaly := false
	var threshold float64
	if d.scoreWindow.size >= 10 {
		threshold = d.rollingThreshold()
		isAnomaly = score > threshold
	}

	// Confidence is the score relative to the threshold, capped at 1.
	// It stays 0 while no usable threshold exists.
	confidence := 0.0
	if threshold > 1e-9 {
		confidence = math.Min(score/threshold, 1.0)
	}

	return types.AnomalyResult{
		Timestamp:  vector.Timestamp,
		Score:      score,
		IsAnomaly:  isAnomaly,
		Confidence: confidence,
		Method:     "RRCF",
	}, nil
}
// rollingThreshold looks up the thresholdPct-quantile of the rolling score
// window. It reads detector state without locking, so the caller must
// already hold d.mu.
func (d *RRCFDetector) rollingThreshold() float64 {
	window, pct := d.scoreWindow, d.thresholdPct
	return window.quantileVal(pct)
}
// Update does nothing and always returns nil: RRCF inserts each point into
// the forest from inside Score, so there is no separate update step.
func (d *RRCFDetector) Update(_ types.FeatureVector) error {
	return nil
}

299
internal/detect/scaling.go Normal file
View file

@ -0,0 +1,299 @@
package detect
import (
"log"
"sync"
"time"
"codeberg.org/pata1704/guenther/pkg/types"
)
// ScalingLevel identifies which detector configuration is currently active.
// Levels are ordered by decreasing detector complexity: the zero value is
// LevelNormal.
type ScalingLevel int
const (
LevelNormal ScalingLevel = iota // SEAD Ensemble (full accuracy)
LevelHigh // COPOD (reduced complexity)
LevelCritical // MAD (minimal overhead)
)
// levelName maps each ScalingLevel to a human-readable label used in log
// messages. Looking up a level that is not listed yields the empty string.
var levelName = map[ScalingLevel]string{
LevelNormal: "SEAD Ensemble (Normal)",
LevelHigh: "COPOD (High Load)",
LevelCritical: "MAD (Critical Load)",
}
// ── SwitchableDetector ───────────────────────────────────────────────────────
// SwitchableDetector wraps a SEADDetector and allows runtime switching to
// lighter-weight sub-detectors (COPOD, MAD) under high CPU load.
//
// State consistency goal: base detectors should be kept up to date
// regardless of which one is currently active, so that switching back to
// SEAD does not start from stale internal state.
//
// Update-deduplication contract:
//
//	SEAD.Score()  calls d.Score() on every base detector; the detectors are
//	              expected to self-update there, so no separate Update()
//	              call is made (it would double-count).
//	SEAD.Update() calls d.Update() on every base detector directly; used when
//	              inactive detectors must advance without scoring through SEAD.
//
// For LevelHigh / LevelCritical, Score calls:
//
//	s.ensemble.Update(vector) → d.Update() on every base detector
//	active.Score(vector)      → scores with the active detector (COPOD or MAD)
//
// What d.Update() does is detector-specific:
//   - RRCFDetector.Update is a no-op (insertion happens only inside
//     RRCFDetector.Score), so the RRCF forests do NOT advance while SEAD is
//     bypassed.
//   - Per the original author's notes, COPOD.Update() appends to the sliding
//     window buffer and COPOD.Score() appends the scored point again
//     (score-then-insert), so COPOD sees the vector twice per tick at
//     LevelHigh, which is described as intentional; MAD.Update() is described
//     as calling Score() internally. NOTE(review): those implementations are
//     not visible here; confirm.
type SwitchableDetector struct {
mu sync.RWMutex // guards activeLevel
ensemble *SEADDetector // full ensemble; used directly at LevelNormal
copod AnomalyDetector // may be nil if COPOD is not configured
mad AnomalyDetector // may be nil if MAD is not configured
activeLevel ScalingLevel // level consulted by Score; changed via Switch
}
// NewSwitchableDetector wraps ensemble in a SwitchableDetector that starts at
// LevelNormal.
//
// The sub-detectors registered under the names "COPOD" and "MAD" are looked
// up once via ensemble.GetDetector and cached for direct use at the higher
// load levels. A name that is not registered leaves the corresponding field
// nil, in which case Score falls back to the ensemble for that level.
func NewSwitchableDetector(ensemble *SEADDetector) *SwitchableDetector {
	sw := &SwitchableDetector{
		ensemble:    ensemble,
		activeLevel: LevelNormal,
	}
	sw.copod = ensemble.GetDetector("COPOD")
	sw.mad = ensemble.GetDetector("MAD")
	return sw
}
// Fit forwards the baseline vectors to the wrapped ensemble, which in turn
// fits every base detector. The ensemble's error is returned unchanged.
func (s *SwitchableDetector) Fit(vectors []types.FeatureVector) error {
	err := s.ensemble.Fit(vectors)
	return err
}
// Update forwards vector to the wrapped ensemble's Update, which calls
// Update on every base detector without scoring.
//
// This method takes no lock of its own; NOTE(review): concurrency safety
// therefore depends on the base detectors' Update implementations.
func (s *SwitchableDetector) Update(vector types.FeatureVector) error {
	err := s.ensemble.Update(vector)
	return err
}
// Score returns an AnomalyResult from the currently active detector.
//
//   - LevelNormal (and any unrecognised level): the vector is scored by the
//     full ensemble. SEADDetector.Score scores every base detector itself, so
//     no separate Update call is made.
//   - LevelHigh / LevelCritical: the vector is first passed to
//     ensemble.Update so the base detectors can advance their state, then it
//     is scored by the active sub-detector (COPOD or MAD) and the result's
//     Method is overwritten with the level-specific label.
//   - If the sub-detector for the active level is not configured, Score logs
//     the fact and falls back to the full ensemble.
//
// Every fallback to ensemble.Score happens BEFORE ensemble.Update is called:
// feeding the same vector through Update and then through the ensemble's
// Score would hand it to the base detectors twice.
//
// The active level is read under the read lock; the scoring itself runs
// without holding s.mu.
func (s *SwitchableDetector) Score(vector types.FeatureVector) (types.AnomalyResult, error) {
	s.mu.RLock()
	level := s.activeLevel
	s.mu.RUnlock()

	// Resolve the detector for this level up front so that every ensemble
	// fallback is decided before any state is advanced.
	var (
		active      AnomalyDetector
		method      string
		fallbackMsg string
	)
	switch level {
	case LevelHigh:
		active = s.copod
		method = "COPOD (High Load)"
		fallbackMsg = "scaling: COPOD unavailable at LevelHigh, falling back to ensemble"
	case LevelCritical:
		active = s.mad
		method = "MAD (Critical Load)"
		fallbackMsg = "scaling: MAD unavailable at LevelCritical, falling back to ensemble"
	default:
		// LevelNormal: SEAD.Score() handles everything internally.
		return s.ensemble.Score(vector)
	}

	if active == nil {
		log.Print(fallbackMsg)
		return s.ensemble.Score(vector)
	}

	// Advance the base detectors. SEAD weight adaptation is NOT performed
	// here because SEAD.Score() is bypassed.
	if err := s.ensemble.Update(vector); err != nil {
		// Non-fatal: log and continue. A single missed update is acceptable;
		// the detector will resync on the next tick.
		log.Printf("scaling: ensemble update error at level %s: %v", levelName[level], err)
	}

	// Score via the active sub-detector and relabel the result.
	res, err := active.Score(vector)
	if err != nil {
		return res, err
	}
	res.Method = method
	return res, nil
}
// Switch sets the active detection level under the write lock.
// Requesting the level that is already active does nothing (and logs
// nothing); an actual change is logged as "[SCALING] old → new".
func (s *SwitchableDetector) Switch(level ScalingLevel) {
	s.mu.Lock()
	defer s.mu.Unlock()

	previous := s.activeLevel
	if previous != level {
		log.Printf("[SCALING] %s → %s", levelName[previous], levelName[level])
		s.activeLevel = level
	}
}
// ── ScalingController ────────────────────────────────────────────────────────
// ScalingController monitors CPU load and drives a SwitchableDetector through
// its scaling levels (Normal → High → Critical and back).
//
// Level transitions follow a two-phase pattern:
//
//  1. A CPU measurement that implies a different level than the pending one
//     replaces the pending level and restarts the stability timer.
//  2. Only after the pending level has been stable for the configured
//     duration is Switch() called on the detector.
//
// This prevents rapid oscillation under bursty workloads.
//
// Level selection (see desiredLevel):
//
//	cpu > critThres → Critical
//	cpu > highThres → High
//	cpu < downThres → Normal (from any level, including Critical)
//	otherwise       → dead-band, resolved from the committed level:
//	                    Critical → High (one step down)
//	                    High     → High (stays until CPU drops below downThres)
//	                    Normal   → Normal
//
// ScalingController is not safe for concurrent use. ObserveCPU must be
// called from a single goroutine (the DetectionLayer's processing loop).
type ScalingController struct {
detector *SwitchableDetector
// Thresholds (CPU percent, 0–100)
highThres float64
critThres float64
downThres float64
// Required stable duration before a level transition is committed.
// Selected by target level: Critical → critDur, High → highDur,
// anything else → downDur.
highDur time.Duration
critDur time.Duration
downDur time.Duration
// currentLevel is the level that has been committed to the detector.
currentLevel ScalingLevel
// pendingLevel is the desired level based on recent CPU measurements.
// It must remain stable for the corresponding duration before becoming current.
pendingLevel ScalingLevel
// pendingStart is the time at which pendingLevel last changed (or at which
// the stability check last passed).
// The pending level is committed when the time elapsed since pendingStart
// is at least the required duration.
pendingStart time.Time
}
// NewScalingController builds a ScalingController that starts at LevelNormal
// with its stability timer set to the current time.
//
// The three thresholds are stored as given. The three durations are supplied
// in seconds (float64, matching the YAML config values) and converted to
// time.Duration.
func NewScalingController(
	detector *SwitchableDetector,
	highThres, critThres, downThres float64,
	highDurSec, critDurSec, downDurSec float64,
) *ScalingController {
	// seconds converts a fractional number of seconds to a time.Duration.
	seconds := func(sec float64) time.Duration {
		return time.Duration(sec * float64(time.Second))
	}

	ctrl := &ScalingController{
		detector:     detector,
		currentLevel: LevelNormal,
		pendingLevel: LevelNormal,
	}
	ctrl.highThres, ctrl.critThres, ctrl.downThres = highThres, critThres, downThres
	ctrl.highDur = seconds(highDurSec)
	ctrl.critDur = seconds(critDurSec)
	ctrl.downDur = seconds(downDurSec)
	// Explicit initialisation avoids the zero-time edge case.
	ctrl.pendingStart = time.Now()
	return ctrl
}
// ObserveCPU feeds one CPU measurement into the controller and switches the
// underlying SwitchableDetector when a new level has been stable long enough.
//
// Not safe for concurrent use: call it from a single goroutine only.
func (c *ScalingController) ObserveCPU(cpuPercent float64) {
	now := time.Now()
	target := c.desiredLevel(cpuPercent)

	// Phase 1: the target differs from the pending level → adopt it as the
	// new pending level and restart the stability timer.
	if target != c.pendingLevel {
		c.pendingLevel, c.pendingStart = target, now
		return
	}

	// Phase 2: the target has been pending for a while → act only once it
	// has been stable for the duration required by that level.
	if stableFor := now.Sub(c.pendingStart); stableFor >= c.durationFor(target) {
		if c.currentLevel != target {
			c.currentLevel = target
			c.detector.Switch(target)
		}
		c.pendingStart = now
	}
}
// desiredLevel maps a CPU measurement to the ScalingLevel it calls for.
//
// Above critThres the answer is Critical, above highThres it is High, and
// below downThres it is Normal. Anything else lies in the dead-band, where
// the committed level decides: Critical and High both resolve to High (so a
// brief dip from Critical steps down one level only), Normal stays Normal.
func (c *ScalingController) desiredLevel(cpuPercent float64) ScalingLevel {
	if cpuPercent > c.critThres {
		return LevelCritical
	}
	if cpuPercent > c.highThres {
		return LevelHigh
	}
	if cpuPercent < c.downThres {
		return LevelNormal
	}

	// Dead-band: degrade at most one step.
	if c.currentLevel == LevelCritical || c.currentLevel == LevelHigh {
		return LevelHigh
	}
	return LevelNormal
}
// durationFor reports how long a target level must stay pending before it is
// committed: critDur for Critical, highDur for High, and downDur for every
// other level.
func (c *ScalingController) durationFor(level ScalingLevel) time.Duration {
	if level == LevelCritical {
		return c.critDur
	}
	if level == LevelHigh {
		return c.highDur
	}
	return c.downDur
}

507
internal/detect/sead.go Normal file
View file

@ -0,0 +1,507 @@
// Package detect provides anomaly detection algorithms and ensemble logic.
package detect
// sead.go SEAD: Unsupervised Ensemble of Streaming Anomaly Detectors
//
// Implementation of Algorithm 1 from:
// Shah et al. "SEAD: Unsupervised Ensemble of Streaming Anomaly Detectors"
// ICML 2025, Amazon Science.
//
// Core algorithm (Multiplicative Weights Update / FTRL with KL-divergence):
//
// 1. For each incoming feature vector x_t:
// a. Score every base detector: s̃_i(t) = A_i(x_t)
// b. Normalise to [0,1] via streaming quantile: s_i(t) = Q(s̃_i(t); history_i)
// c. Compute softmax weights: p_i(t) = exp(w_i) / Σ exp(w_j)
// d. Output combined score: S_t = Σ p_i(t) · s_i(t)
// e. Update weights: w_i(t+1) = w_i(t) − η · ∂L_t/∂w_i
// where L_t = S_t + λ · KL(p || π)
// 2. Update each base detector: A_i(t+1) ← Update(A_i(t), x_t)
//
// Streaming quantiles are approximated via a fixed-capacity sorted circular
// buffer (lightweight t-digest substitute). For N=4 detectors at 1 Hz this
// is negligible memory and CPU overhead.
//
// SEAD runs parallel to the existing AVG/MAX/MEDIAN ensemble; it is selected
// by setting detector.ensemble.method = "sead" in the config.
import (
"fmt"
"math"
"sort"
"strings"
"sync"
"codeberg.org/pata1704/guenther/pkg/types"
)
// ─── FIFO Ring Buffer ─────────────────────────────────────────────────────────
// ringBuffer is a fixed-capacity circular buffer of float64 samples with true
// FIFO eviction: once full, each push overwrites the oldest sample.
//
// Memory: O(cap · 8 bytes). For cap=500 this is about 4 KB per buffer.
//
// ringBuffer is not synchronised; callers must provide their own locking.
type ringBuffer struct {
	data []float64
	head int // index of the next write position
	size int // current number of elements
	cap  int // fixed capacity, equal to len(data)
}

// newRingBuffer allocates an empty ringBuffer. Capacities below 10 are raised
// to 10.
func newRingBuffer(capacity int) *ringBuffer {
	if capacity < 10 {
		capacity = 10
	}
	return &ringBuffer{
		data: make([]float64, capacity),
		cap:  capacity,
	}
}

// at returns the i-th live sample, counted from the oldest (i = 0) to the
// newest (i = size-1).
func (r *ringBuffer) at(i int) float64 {
	return r.data[(r.head-r.size+i+r.cap)%r.cap]
}

// push inserts v, overwriting the oldest entry when the buffer is full, and
// returns the empirical rank of v within the resulting window:
//
//	rank = (number of samples ordered before v) / (size - 1)
//
// which lies in [0,1] for ordinary values. With a single sample in the window
// the result is 0.5. Samples equal to v do not count as "before" it, so ties
// share the lowest rank of their group.
func (r *ringBuffer) push(v float64) float64 {
	r.data[r.head] = v
	r.head = (r.head + 1) % r.cap
	if r.size < r.cap {
		r.size++
	}

	n := r.size
	if n <= 1 {
		return 0.5
	}

	// Count the samples that would sort before v. This is exactly the index
	// a binary search for v in the sorted window would return, obtained in a
	// single pass without copying or sorting the window on every push.
	// The negated form (rather than x < v) keeps NaN handling identical to
	// that search: NaN samples count as "before" any v.
	rank := 0
	for i := 0; i < n; i++ {
		if x := r.at(i); !(x >= v) {
			rank++
		}
	}
	return float64(rank) / float64(n-1)
}

// quantileVal returns the sample at quantile p of the current window without
// modifying the buffer. The sample is sorted[int(p·(size-1))], i.e. the lower
// neighbour is taken rather than interpolating.
//
// An empty buffer yields 0. p is clamped to [0,1]: p ≤ 0 (and NaN) returns
// the minimum, p ≥ 1 returns the maximum. Without the lower clamp a negative
// p would index before the start of the window and panic.
func (r *ringBuffer) quantileVal(p float64) float64 {
	n := r.size
	if n == 0 {
		return 0
	}

	sorted := make([]float64, n)
	for i := 0; i < n; i++ {
		sorted[i] = r.at(i)
	}
	sort.Float64s(sorted)

	switch {
	case !(p > 0): // p <= 0 or NaN
		return sorted[0]
	case p >= 1:
		return sorted[n-1]
	}
	return sorted[int(p*float64(n-1))]
}

// streamQuantile is an alias kept for API compatibility.
// New code should use ringBuffer directly.
type streamQuantile = ringBuffer

// newStreamQuantile is the compatibility constructor matching streamQuantile;
// it simply calls newRingBuffer.
func newStreamQuantile(capacity int) *ringBuffer {
	return newRingBuffer(capacity)
}
// ─── SEADDetector ─────────────────────────────────────────────────────────────
// SEADDetector implements the SEAD algorithm: an unsupervised online ensemble
// that adaptively weights N base anomaly detectors using Multiplicative Weights
// Update (MWU / FTRL with KL-divergence regulariser).
//
// Key properties:
//   - Fully unsupervised: no anomaly labels required.
//   - Bounded per-step cost: all histories are fixed-capacity ring buffers, so
//     the work per time step does not grow with stream length.
//   - Adaptive: detector weights shift as data distribution changes.
//   - Score-scale agnostic: all base scores are rank-normalised within their
//     own recent history before aggregation, preventing any single detector
//     from dominating due to score magnitude differences.
//
// Configuration (see SEADConfig; defaults are applied by NewSEADDetector when
// a value is out of range):
//   - eta (η): MWU learning rate. Larger → faster adaptation, more noise.
//     Default: 0.1.
//   - lambda (λ): KL-divergence regularisation strength. 0 = pure MWU.
//     Positive values pull weights toward the uniform prior π.
//   - quantileWindow: number of past scores retained per detector for rank
//     normalisation. Default: 300.
//   - contamination: expected anomaly fraction used to set the decision
//     threshold as quantile(combinedHistory, 1-contamination). Default: 0.15.
//   - minDataPoints: anomalies are flagged only once combinedHistory holds
//     more than this many scores. Default: 20.
type SEADDetector struct {
detectors []AnomalyDetector // N base detectors, in the order given to NewSEADDetector
names []string // human-readable name per detector (same order as detectors)
// MWU state
weights []float64 // w_i (log-space, unconstrained)
eta float64 // learning rate η
lambda float64 // KL regularisation strength λ
prior []float64 // π, initialised to the uniform distribution 1/N
// Rank-normalisation window per detector (one ring buffer each)
quantiles []*streamQuantile
// Combined score history for threshold computation.
// Uses a FIFO ring buffer so every score is evicted after a fixed number
// of time steps, regardless of its magnitude.
contamination float64
combinedHistory *ringBuffer // FIFO ring buffer, capacity=1000
minDataPoints int
mu sync.Mutex // guards weights, quantiles and combinedHistory during Score
}
// SEADConfig groups the tunable parameters of the SEAD ensemble.
type SEADConfig struct {
	// Eta is the MWU learning rate η. Higher values react faster to
	// distribution shifts but are noisier.
	// Recommended: 0.05–0.20. Default: 0.10.
	Eta float64

	// Lambda is the KL-divergence regularisation strength.
	// 0 means pure MWU (no penalty for drifting away from the prior);
	// positive values add stability, e.g. 0.01–0.05.
	Lambda float64

	// QuantileWindow is how many past scores are kept per detector.
	// Larger windows give more stable quantiles but adapt more slowly.
	// Default: 300.
	QuantileWindow int

	// Contamination is the expected anomaly fraction ∈ [0, 0.5).
	// The decision threshold is the (1-contamination)-quantile of the
	// combined score history. Default: 0.15.
	Contamination float64

	// MinDataPoints is the cold-start guard: no anomaly is flagged until
	// enough windows have been scored. Default: 20.
	MinDataPoints int
}

// DefaultSEADConfig returns the default SEAD parameters:
// Eta 0.10, Lambda 0.01, QuantileWindow 300, Contamination 0.15,
// MinDataPoints 20.
func DefaultSEADConfig() SEADConfig {
	var cfg SEADConfig
	cfg.Eta = 0.10
	cfg.Lambda = 0.01
	cfg.QuantileWindow = 300
	cfg.Contamination = 0.15
	cfg.MinDataPoints = 20
	return cfg
}
// NewSEADDetector assembles a SEAD ensemble from N base detectors.
//
//   - detectors: base AnomalyDetector implementations; at least one is required.
//   - names: one human-readable label per detector (used in the Details field);
//     must have the same length as detectors.
//   - cfg: tuning parameters. Out-of-range values are replaced by defaults:
//     Eta ≤ 0 → 0.10, QuantileWindow ≤ 0 → 300,
//     Contamination outside (0, 0.5) → 0.15, MinDataPoints ≤ 0 → 20.
//     Lambda is used as given.
//
// The ensemble starts with all log-space weights at 0 (softmax = 1/N), a
// uniform prior, and a combined-score history of capacity 1000.
//
// An error is returned when detectors is empty or the lengths differ.
func NewSEADDetector(
	detectors []AnomalyDetector,
	names []string,
	cfg SEADConfig,
) (*SEADDetector, error) {
	count := len(detectors)
	switch {
	case count == 0:
		return nil, fmt.Errorf("sead: at least one base detector required")
	case len(names) != count:
		return nil, fmt.Errorf("sead: names length %d must match detectors length %d", len(names), count)
	}

	// Replace unusable settings with their defaults.
	eta := cfg.Eta
	if eta <= 0 {
		eta = 0.10
	}
	window := cfg.QuantileWindow
	if window <= 0 {
		window = 300
	}
	contamination := cfg.Contamination
	if contamination <= 0 || contamination >= 0.5 {
		contamination = 0.15
	}
	minPoints := cfg.MinDataPoints
	if minPoints <= 0 {
		minPoints = 20
	}

	// Uniform prior π = 1/N and one normalisation window per detector.
	uniform := 1.0 / float64(count)
	prior := make([]float64, count)
	quantiles := make([]*streamQuantile, count)
	for i := 0; i < count; i++ {
		prior[i] = uniform
		quantiles[i] = newStreamQuantile(window)
	}

	det := &SEADDetector{
		detectors: detectors,
		names:     names,
		// Zero-initialised log-space weights: w_i = 0 → softmax = 1/N.
		weights:         make([]float64, count),
		eta:             eta,
		lambda:          cfg.Lambda,
		prior:           prior,
		quantiles:       quantiles,
		contamination:   contamination,
		combinedHistory: newRingBuffer(1000),
		minDataPoints:   minPoints,
	}
	return det, nil
}
// Fit passes the baseline vectors to every base detector in order.
// SEAD itself has no training phase; only the base detectors are fitted.
// The first failure aborts the loop and is returned wrapped with the name of
// the detector that failed.
func (s *SEADDetector) Fit(vectors []types.FeatureVector) error {
	for idx := range s.detectors {
		err := s.detectors[idx].Fit(vectors)
		if err == nil {
			continue
		}
		return fmt.Errorf("sead: fit detector %q: %w", s.names[idx], err)
	}
	return nil
}
// Update hands the feature vector to every base detector's Update in order.
// The first failure aborts the loop (later detectors are not updated) and is
// returned wrapped with the name of the detector that failed.
func (s *SEADDetector) Update(vector types.FeatureVector) error {
	for idx := range s.detectors {
		err := s.detectors[idx].Update(vector)
		if err == nil {
			continue
		}
		return fmt.Errorf("sead: update detector %q: %w", s.names[idx], err)
	}
	return nil
}
// Score implements Algorithm 1 from the SEAD paper.
//
// Steps:
//  1. Score each base detector → raw scores s̃_i. Detectors are expected to
//     self-update inside their own Score, so Update is NOT called here.
//  2. Rank-normalise each s̃_i to ŝ_i ∈ [0,1] within that detector's
//     streaming window. A detector whose Score failed contributes the
//     neutral value ŝ_i = 0.5 and its window is left untouched.
//  3. Compute softmax weights p_i = exp(w_i) / Σ exp(w_j).
//  4. Combined score S = Σ p_i · ŝ_i.
//  5. Update weights: w_i -= η · (p_i·(ŝ_i − S) + λ·(p_i − π_i)).
//  6. Threshold S against the rolling (1-contamination)-quantile of the S
//     history; a point is flagged only once the history holds more than
//     minDataPoints scores.
//
// The returned error is always nil: base-detector failures are absorbed in
// step 2 rather than propagated.
func (s *SEADDetector) Score(vector types.FeatureVector) (types.AnomalyResult, error) {
	n := len(s.detectors)

	// ── Step 1: Score all base detectors ──────────────────────────────────────
	// Runs before s.mu is taken; only ensemble state is guarded by the lock.
	rawScores := make([]float64, n)
	anomalyFlags := make([]bool, n)
	failed := make([]bool, n)
	for i, d := range s.detectors {
		res, err := d.Score(vector)
		if err != nil {
			// Degrade gracefully: the detector is treated as neutral in
			// step 2 instead of aborting the whole ensemble.
			failed[i] = true
			continue
		}
		rawScores[i] = res.Score
		anomalyFlags[i] = res.IsAnomaly
	}

	s.mu.Lock()
	defer s.mu.Unlock()

	// ── Step 2: Rank-normalise scores to [0,1] ────────────────────────────────
	normScores := make([]float64, n)
	for i, raw := range rawScores {
		if failed[i] {
			// Neutral in NORMALISED space. Pushing a placeholder raw value
			// into the window instead would be ranked against real raw
			// scores (whose scale is detector-specific) and would pollute
			// the history used to normalise later scores.
			normScores[i] = 0.5
			continue
		}
		normScores[i] = s.quantiles[i].push(raw)
	}

	// ── Step 3: Softmax weights ───────────────────────────────────────────────
	p := softmax(s.weights)

	// ── Step 4: Combined score ────────────────────────────────────────────────
	combined := 0.0
	for i := range p {
		combined += p[i] * normScores[i]
	}

	// ── Step 5: Weight update (MWU gradient step) ─────────────────────────────
	// Per-detector step: p_i · (ŝ_i − combined) + λ · (p_i − π_i),
	// i.e. the softmax gradient of the combined score plus a pull towards
	// the prior scaled by λ.
	for i := range s.weights {
		gradCombined := p[i] * (normScores[i] - combined)
		gradKL := s.lambda * (p[i] - s.prior[i])
		s.weights[i] -= s.eta * (gradCombined + gradKL)
	}

	// ── Step 6: Threshold decision ────────────────────────────────────────────
	// The current combined score is pushed first, so it is part of the
	// history the threshold is derived from. The ring buffer evicts the
	// oldest score automatically, giving the threshold a sliding memory.
	s.combinedHistory.push(combined)
	threshold := s.combinedHistory.quantileVal(1.0 - s.contamination)
	isAnomaly := s.combinedHistory.size > s.minDataPoints && combined > threshold

	// Confidence is the combined score relative to the threshold, capped at 1.
	confidence := 0.0
	if threshold > 1e-9 {
		confidence = math.Min(combined/threshold, 1.0)
	}

	return types.AnomalyResult{
		Timestamp:  vector.Timestamp,
		Score:      combined,
		IsAnomaly:  isAnomaly,
		Confidence: confidence,
		Method:     "SEAD",
		Details:    s.detailString(p, normScores, anomalyFlags),
	}, nil
}
// GetDetector looks up a base detector by its registered name and returns
// the first match, or nil when no detector carries that name.
// The lookup runs under s.mu.
func (s *SEADDetector) GetDetector(name string) AnomalyDetector {
	s.mu.Lock()
	defer s.mu.Unlock()

	for idx := 0; idx < len(s.names); idx++ {
		if s.names[idx] == name {
			return s.detectors[idx]
		}
	}
	return nil
}
// Weights returns the current detector weights after softmax normalisation,
// as a freshly allocated slice. Intended for logging and diagnostics.
// The weights are read under s.mu.
func (s *SEADDetector) Weights() []float64 {
	s.mu.Lock()
	defer s.mu.Unlock()

	normalised := softmax(s.weights)
	return normalised
}
// WeightSummary renders the softmax-normalised detector weights as
// "name=0.123 | name=0.456 | …", in detector order.
func (s *SEADDetector) WeightSummary() string {
	w := s.Weights()

	entries := make([]string, len(s.names))
	for i, name := range s.names {
		entries[i] = fmt.Sprintf("%s=%.3f", name, w[i])
	}
	return strings.Join(entries, " | ")
}
// detailString formats the per-detector diagnostics stored in
// AnomalyResult.Details: one space-separated "name:w=…,s=…" entry per
// detector, where w is the softmax weight and s the normalised score.
// A "!" directly after the name marks a detector that flagged the point.
// Caller must hold s.mu.
func (s *SEADDetector) detailString(p, normScores []float64, flags []bool) string {
	entries := make([]string, 0, len(s.names))
	for i := range s.names {
		marker := ""
		if flags[i] {
			marker = "!"
		}
		entry := fmt.Sprintf("%s%s:w=%.2f,s=%.2f", s.names[i], marker, p[i], normScores[i])
		entries = append(entries, entry)
	}
	return strings.Join(entries, " ")
}
// ─── Math helpers ─────────────────────────────────────────────────────────────
// softmax returns exp(w_i) / Σ exp(w_j) as a new slice of the same length.
//
// The maximum of w is subtracted before exponentiation for numerical
// stability, so large weights do not overflow. An empty (or nil) input yields
// an empty slice instead of panicking.
func softmax(w []float64) []float64 {
	out := make([]float64, len(w))
	if len(w) == 0 {
		return out
	}

	maxW := w[0]
	for _, v := range w[1:] {
		if v > maxW {
			maxW = v
		}
	}

	var sum float64
	for i, v := range w {
		out[i] = math.Exp(v - maxW)
		sum += out[i]
	}
	// sum ≥ 1 for finite input: the largest weight contributes exp(0) = 1.
	for i := range out {
		out[i] /= sum
	}
	return out
}
// ─── Factory helpers ──────────────────────────────────────────────────────────
// NewSEADWithAllDetectors builds a SEAD ensemble from five base detectors,
// registered under these names and in this order:
// MAD, RRCF-fast, RRCF-mid, RRCF-slow, COPOD.
//
// RRCF variant settings left at zero are replaced by defaults:
//
//	variant  NumTrees  TreeSize  ThresholdPercentile
//	Fast        50        32          0.85
//	Mid        150        64          0.85
//	Slow       200       128          0.85
//
// All RRCF variants are created without a warmup buffer. A madCalibSize ≤ 0
// is replaced by 100 before it is passed to NewMADDetectorAutoCalibrate.
//
// An error is returned if the COPOD detector cannot be constructed or if
// NewSEADDetector rejects the configuration.
func NewSEADWithAllDetectors(
	copodBufferSize int, copodThreshold float64,
	rrcfVariants RRCFVariantsConfig,
	madThreshold float64, madCalibSize int,
	seadCfg SEADConfig,
) (*SEADDetector, error) {
	// fillDefaults replaces zero-valued settings of one RRCF variant in place.
	fillDefaults := func(numTrees, treeSize *int, pct *float64, defTrees, defSize int) {
		if *numTrees == 0 {
			*numTrees = defTrees
		}
		if *treeSize == 0 {
			*treeSize = defSize
		}
		if *pct == 0 {
			*pct = 0.85
		}
	}
	fast, mid, slow := &rrcfVariants.Fast, &rrcfVariants.Mid, &rrcfVariants.Slow
	fillDefaults(&fast.NumTrees, &fast.TreeSize, &fast.ThresholdPercentile, 50, 32)
	fillDefaults(&mid.NumTrees, &mid.TreeSize, &mid.ThresholdPercentile, 150, 64)
	fillDefaults(&slow.NumTrees, &slow.TreeSize, &slow.ThresholdPercentile, 200, 128)

	// ── Construct base detectors ──────────────────────────────────────────────
	copod, err := NewCOPODDetector(copodBufferSize, copodThreshold)
	if err != nil {
		return nil, fmt.Errorf("sead: copod: %w", err)
	}

	// newRRCF creates one RRCF variant with warmup disabled.
	newRRCF := func(numTrees, treeSize int, pct float64) *RRCFDetector {
		return NewRRCFDetector(numTrees, treeSize, 0, pct)
	}
	rrcfFast := newRRCF(fast.NumTrees, fast.TreeSize, fast.ThresholdPercentile)
	rrcfMid := newRRCF(mid.NumTrees, mid.TreeSize, mid.ThresholdPercentile)
	rrcfSlow := newRRCF(slow.NumTrees, slow.TreeSize, slow.ThresholdPercentile)

	if madCalibSize <= 0 {
		madCalibSize = 100
	}
	mad := NewMADDetectorAutoCalibrate(madThreshold, madCalibSize)

	base := []AnomalyDetector{mad, rrcfFast, rrcfMid, rrcfSlow, copod}
	labels := []string{"MAD", "RRCF-fast", "RRCF-mid", "RRCF-slow", "COPOD"}
	return NewSEADDetector(base, labels, seadCfg)
}

View file

@ -0,0 +1,61 @@
package detect
import (
"testing"
"github.com/stretchr/testify/assert"
)
// TestRingBuffer_FIFO verifies that the ring buffer fills up to its capacity,
// wraps its write position, and evicts the oldest sample first.
func TestRingBuffer_FIFO(t *testing.T) {
	const capacity = 10

	// Fresh buffer: requested capacity, no samples.
	buf := newRingBuffer(capacity)
	assert.Equal(t, capacity, buf.cap)
	assert.Equal(t, 0, buf.size)

	// Fill with 1..10; the write position wraps back to 0.
	for sample := 1; sample <= capacity; sample++ {
		buf.push(float64(sample))
	}
	assert.Equal(t, capacity, buf.size)
	assert.Equal(t, 0, buf.head)

	// Sorted view is [1 … 10]; the median index is int(0.5 * 9) = 4 → 5.
	assert.Equal(t, 5.0, buf.quantileVal(0.5))

	// One more push overwrites the oldest sample (1.0).
	buf.push(11.0)
	assert.Equal(t, capacity, buf.size)
	assert.Equal(t, 1, buf.head)

	// Sorted view is now [2 … 11]; the index is int(p * 9).
	afterEviction := []struct {
		p    float64
		want float64
	}{
		{p: 0.4, want: 5.0},  // int(3.6) = 3 → 5
		{p: 0.0, want: 2.0},  // index 0 → 2 (1.0 is gone)
		{p: 1.0, want: 11.0}, // index 9 → 11
	}
	for _, tc := range afterEviction {
		assert.Equal(t, tc.want, buf.quantileVal(tc.p))
	}
}
// TestRingBuffer_Rank verifies the rank returned by push, which is
// (index of the value in the sorted window) / (n - 1), or 0.5 when n = 1.
func TestRingBuffer_Rank(t *testing.T) {
	first := newRingBuffer(5)
	steps := []struct {
		in   float64
		want float64
	}{
		{in: 10.0, want: 0.5}, // n=1 → 0.5
		{in: 20.0, want: 1.0}, // sorted=[10 20], index 1 → 1/1
		{in: 5.0, want: 0.0},  // sorted=[5 10 20], index 0 → 0/2
	}
	for _, step := range steps {
		assert.Equal(t, step.want, first.push(step.in))
	}
	// Duplicate value: sorted=[5 10 10 20], first index of 10 is 1 → 1/3.
	assert.InDelta(t, 0.3333333333333333, first.push(10.0), 1e-9)

	second := newRingBuffer(4)
	second.push(1.0)
	second.push(3.0)
	// sorted=[1 2 3], index 1 → 1/2.
	middle := second.push(2.0)
	assert.Equal(t, 0.5, middle)
}

View file

@ -0,0 +1,32 @@
// Package drain3 provides regex-based log masking, which runs in front of
// Drain3 template mining.
package drain3
import (
"codeberg.org/pata1704/guenther/internal/config"
)
// ApplyMasking runs every entry of patterns over line, in order, and returns
// the fully masked line together with the parameters captured on the way.
//
// Each pattern operates on the output of the previous one. For a pattern with
// a non-empty Name, capture group 1 of its first match is recorded in params
// under that name before all matches are replaced with the pattern's Replace
// text; nothing is recorded when the regex does not match or has no capture
// group. Patterns without a Name only mask. Patterns whose Re is nil are
// skipped.
//
// No regex is compiled here: Re is expected to be populated beforehand (per
// the original note, via config.Compile at startup), keeping this hot path
// free of compilation.
func ApplyMasking(line string, patterns []config.MaskingPattern) (masked string, params map[string]string) {
	captured := make(map[string]string, len(patterns))
	current := line
	for _, pattern := range patterns {
		re := pattern.Re
		if re == nil {
			continue
		}
		if pattern.Name != "" {
			// Capture before replacing: afterwards the original text is gone.
			if groups := re.FindStringSubmatch(current); len(groups) > 1 {
				captured[pattern.Name] = groups[1]
			}
		}
		current = re.ReplaceAllString(current, pattern.Replace)
	}
	return current, captured
}

111
internal/health/monitor.go Normal file
View file

@ -0,0 +1,111 @@
package health
import (
"context"
"encoding/json"
"log"
"sync"
"time"
"codeberg.org/pata1704/guenther/pkg/types"
)
// HealthMonitor collects StageHealth snapshots from pipeline stages and
// periodically prints a JSON report to the standard logger.
//
// Stages write to the channel returned by Chan(). The channel is buffered
// (capacity 100), so a plain send only blocks once 100 updates are waiting to
// be consumed by the collector goroutine.
//
// The channel field is private and is handed out only as a send-only channel
// via Chan(), so callers cannot receive from it. Note that Go still permits
// close() and cap() on a send-only channel; callers must never close it.
type HealthMonitor struct {
	healthChan chan types.StageHealth // inbound updates, consumed by the goroutine launched in Start
	mu sync.Mutex // guards stages
	stages map[string]*types.StageHealth // most recent update per StageName
	wg sync.WaitGroup // tracks the goroutine launched in Start; see Wait
}
// NewHealthMonitor returns a HealthMonitor with its update channel (buffer of
// 100) and its per-stage map allocated. Nothing is consumed from the channel
// until Start is called.
func NewHealthMonitor() *HealthMonitor {
	monitor := new(HealthMonitor)
	monitor.healthChan = make(chan types.StageHealth, 100)
	monitor.stages = make(map[string]*types.StageHealth)
	return monitor
}
// Chan returns the send-only view of the channel that pipeline stages use to
// submit health updates. HealthMonitor itself never closes this channel, so it
// stays open for the lifetime of the monitor; callers must not close it
// either.
func (m *HealthMonitor) Chan() chan<- types.StageHealth {
	return m.healthChan
}
// Start launches the collector goroutine, which stores the latest update per
// stage and prints a report every interval (typically 5 s).
//
// A non-positive interval disables the periodic report instead of panicking
// (time.NewTicker panics for d <= 0); updates are still collected and remain
// available through Snapshot.
//
// When ctx is cancelled the goroutine stores whatever updates are already
// queued on the channel and then exits; use Wait to block until it has done
// so.
func (m *HealthMonitor) Start(ctx context.Context, interval time.Duration) {
	// Only arm the ticker for a usable interval. A nil channel is never ready
	// in a select, so leaving tick nil simply switches the report case off.
	var (
		ticker *time.Ticker
		tick   <-chan time.Time
	)
	if interval > 0 {
		ticker = time.NewTicker(interval)
		tick = ticker.C
	}

	// record stores h as the latest snapshot for its stage. h is this call's
	// own copy, so the map owns the value it points at.
	record := func(h types.StageHealth) {
		m.mu.Lock()
		defer m.mu.Unlock()
		m.stages[h.StageName] = &h
	}

	m.wg.Go(func() {
		if ticker != nil {
			defer ticker.Stop()
		}
		for {
			select {
			case h := <-m.healthChan:
				record(h)
			case <-tick:
				m.printReport()
			case <-ctx.Done():
				// Drain remaining queued updates without blocking, then exit.
				for {
					select {
					case h := <-m.healthChan:
						record(h)
					default:
						return
					}
				}
			}
		}
	})
}
// Wait blocks until the goroutine launched by Start has returned, which
// happens after the context passed to Start is cancelled and the queued
// updates have been drained. It returns immediately if Start was never
// called.
func (m *HealthMonitor) Wait() {
	m.wg.Wait()
}
// Snapshot returns a copy of the most recent health record of every stage,
// keyed by stage name. The records are copied by value under the monitor's
// lock, so the returned map can be read and modified freely by the caller.
// Useful for tests and metrics endpoints.
func (m *HealthMonitor) Snapshot() map[string]types.StageHealth {
	m.mu.Lock()
	defer m.mu.Unlock()

	snapshot := make(map[string]types.StageHealth, len(m.stages))
	for stageName, latest := range m.stages {
		snapshot[stageName] = *latest
	}
	return snapshot
}
// printReport writes one JSON line per known stage to the standard logger,
// framed by a header and a footer line. The monitor's lock is held for the
// whole report. Stages are visited in map iteration order, so the line order
// is not stable between reports. A stage that fails to marshal is reported
// with the error and does not stop the remaining stages from being printed.
func (m *HealthMonitor) printReport() {
	m.mu.Lock()
	defer m.mu.Unlock()

	log.Println("── Pipeline Health ──────────────────────────────")
	for _, stage := range m.stages {
		payload, err := json.Marshal(stage)
		if err == nil {
			log.Printf("[%s] %s", stage.StageName, payload)
		} else {
			log.Printf("[%s] marshal error: %v", stage.StageName, err)
		}
	}
	log.Println("─────────────────────────────────────────────────")
}

1091
internal/transform/engine.go Normal file

File diff suppressed because it is too large Load diff

View file

@ -0,0 +1,106 @@
package transform
import (
"context"
"testing"
"time"
"codeberg.org/pata1704/guenther/internal/config"
"codeberg.org/pata1704/guenther/pkg/types"
"github.com/stretchr/testify/assert"
)
// TestTransformEngine_Fusion feeds two consecutive one-second windows of
// metrics through an in-memory engine and checks the aggregated, delta and
// ratio features of the emitted FeatureVectors, including the tail slots of
// NormalizedVector.
func TestTransformEngine_Fusion(t *testing.T) {
	logChan := make(chan types.LogEvent, 100)
	metricChan := make(chan types.MetricSnapshot, 100)
	serviceStatusChan := make(chan types.ServiceStatus, 100)
	featureChan := make(chan types.FeatureVector, 100)
	healthChan := make(chan types.StageHealth, 10)

	cfg := &config.Config{}
	cfg.Transformation.WindowSize = 1 * time.Second
	cfg.Transformation.DbPath = ":memory:"

	engine, err := NewTransformEngine(cfg, logChan, metricChan, serviceStatusChan, featureChan, healthChan)
	// Stop here on a construction error: continuing would dereference a nil
	// engine and hide the real failure behind a panic.
	if !assert.NoError(t, err) {
		t.FailNow()
	}

	baseTime := time.Date(2026, 1, 1, 12, 0, 0, 0, time.Local)

	// 1. Send data for first window
	metricChan <- types.MetricSnapshot{
		Timestamp:         baseTime,
		CPUPercent:        50.0,
		MemoryUsedMB:      1000,
		MemoryDirtyMB:     100,
		NetworkInMBps:     10.0,
		NetworkOutMBps:    20.0,
		TCPRetransPerS:    5,
		NetPacketsInPerS:  100,
		NetPacketsOutPerS: 200,
	}

	// 2. Start engine and wait for first window
	ctx, cancel := context.WithCancel(context.Background())
	engine.Start(ctx)
	defer func() {
		cancel()
		engine.Wait()
	}()

	select {
	case fv := <-featureChan:
		assert.Equal(t, 50.0, fv.AvgCPUPercent)
		// Deltas are absolute value on first window because tracker starts at 0
		assert.Equal(t, 10.0, fv.DeltaNetIn)
	case <-time.After(2 * time.Second):
		t.Fatal("Timeout waiting for first FeatureVector")
	}

	// 3. Send data for second window (triggers deltas)
	secondTime := baseTime.Add(cfg.Transformation.WindowSize)
	metricChan <- types.MetricSnapshot{
		Timestamp:         secondTime,
		CPUPercent:        60.0,
		MemoryUsedMB:      1000,
		MemoryDirtyMB:     200,
		NetworkInMBps:     15.0, // DeltaNetIn = 15.0 - 10.0 = 5.0
		NetworkOutMBps:    20.0,
		TCPRetransPerS:    10, // DeltaTCPRetrans = 10.0 - 5.0 = 5.0
		NetPacketsInPerS:  150,
		NetPacketsOutPerS: 200,
	}

	select {
	case fv := <-featureChan:
		// Check original logic
		assert.Equal(t, 60.0, fv.AvgCPUPercent)

		// Check new delta features
		assert.Equal(t, 5.0, fv.DeltaNetIn)
		assert.Equal(t, 5.0, fv.DeltaTCPRetrans)

		// Check ratio features
		// MemPressure = dirty / (used + 1) = 200/1001
		expectedPressure := 200.0 / 1001.0
		assert.InDelta(t, expectedPressure, fv.MemPressure, 1e-9)
		// NetAsymmetry = in / (out + 1e-3) = 15/20.001
		expectedAsym := 15.0 / 20.001
		assert.InDelta(t, expectedAsym, fv.NetAsymmetry, 1e-9)

		// Check NormalizedVector length (should be 45 base + params).
		// Abort on a short vector: the slot checks below would otherwise
		// panic with an index-out-of-range instead of a readable failure.
		if !assert.GreaterOrEqual(t, len(fv.NormalizedVector), 45) {
			t.FailNow()
		}

		// Verify slots 39-44 (Engineered Features tail)
		nv := fv.NormalizedVector
		assert.Equal(t, 5.0, nv[39]) // DeltaNetIn
		assert.Equal(t, 5.0, nv[40]) // DeltaTCPRetrans
		// TcpRollStd and NetRollStd will have values (even if just 2 pts)
		assert.Greater(t, nv[41], 0.0)                    // TcpRollStd (10 and 5)
		assert.Equal(t, 0.0, nv[42])                      // NetRollStd (20 and 20 -> std=0)
		assert.InDelta(t, expectedPressure, nv[43], 1e-9) // MemPressure
		assert.InDelta(t, expectedAsym, nv[44], 1e-9)     // NetAsymmetry
	case <-time.After(2 * time.Second):
		t.Fatal("Timeout waiting for second FeatureVector")
	}
}

View file

@ -0,0 +1,230 @@
// Package transform contains the DuckDB-backed Tumbling Window Engine.
package transform
import (
"fmt"
"strings"
"codeberg.org/pata1704/guenther/internal/config"
)
// scalerFeatureNames is the ordered list of aggregate features that have a
// row in scaler_params and are scaled by the fusion query. The order is
// significant: BuildFusionQuery iterates this slice to generate both the
// scaler columns and the scaled output columns.
//
// Features that are not listed here are not scaled by the fusion query (per
// the original note: they are derived from already-scaled inputs or are
// ratio/delta features).
var scalerFeatureNames = []string{
	// CPU (3)
	"avg_cpu", "max_cpu", "std_cpu",
	// System/Kernel (7)
	"avg_iowait", "std_iowait", "avg_softirq", "avg_ctx_switches", "avg_interrupts", "avg_softnet_dropped", "avg_softnet_squeeze",
	// Network (8)
	"avg_net_in", "std_net_in", "avg_net_out", "std_net_out", "sum_tcp_retrans", "sum_tcp_fast_retrans", "sum_tcp_timeouts", "avg_net_drops",
	// Disk (4)
	"avg_disk_read", "avg_disk_write", "avg_disk_io_ticks", "std_disk_io_ticks",
	// Log (2)
	"error_count", "severity_score",
}

// ScalerFeatureNames returns the ordered list of feature names stored in
// scaler_params.
//
// The result is a fresh copy on every call, so callers may sort, truncate or
// otherwise modify it without corrupting the package-level order that the SQL
// generation depends on.
func ScalerFeatureNames() []string {
	return append([]string(nil), scalerFeatureNames...)
}
// BuildScalerParamsTable returns the DDL that creates the scaler_params table
// if it does not exist yet: one row per feature_name (primary key) with
// non-null mean and std columns. The statement takes no parameters.
func BuildScalerParamsTable() string {
	return `CREATE TABLE IF NOT EXISTS scaler_params (
feature_name VARCHAR PRIMARY KEY,
mean DOUBLE NOT NULL,
std DOUBLE NOT NULL
)`
}
// BuildFitScalerQuery returns the SQL statement that (re)fits the scaler. It
// upserts one row per scaler feature into scaler_params, computed from the
// raw_metrics rows with $1 <= timestamp < $2.
//
// Despite the column names, the values are robust statistics:
//   - mean holds the median (PERCENTILE_CONT(0.5)) of the underlying raw
//     column. max_cpu reuses the cpu_percent median, and the sum_tcp_* rows
//     use the median of the raw per-sample column, not of a window sum.
//   - std holds the inter-quartile range (P75 - P25), floored at 1e-9, but
//     only for avg_cpu, max_cpu (both from cpu_percent), avg_iowait,
//     avg_net_in, avg_net_out and avg_disk_io_ticks. Every other feature gets
//     a scale of 1.0.
//   - std_* features and the two log features (error_count, severity_score)
//     are written as centre 0.0 / scale 1.0. The m_std_* columns computed in
//     the stats CTE are not read by the outer SELECT, which uses literals.
//
// NOTE(review): with no raw_metrics rows in the range the percentiles are
// presumably NULL, which the NOT NULL mean column of scaler_params would
// reject — confirm how callers handle an empty calibration window.
func BuildFitScalerQuery() string {
	return `
INSERT OR REPLACE INTO scaler_params (feature_name, mean, std)
WITH stats AS (
SELECT
-- CPU
PERCENTILE_CONT(0.5) WITHIN GROUP (ORDER BY cpu_percent) AS m_avg_cpu,
PERCENTILE_CONT(0.5) WITHIN GROUP (ORDER BY cpu_percent) AS m_max_cpu, -- Approximation
0.0 AS m_std_cpu, -- Baseline std is often 0 or low
-- System
PERCENTILE_CONT(0.5) WITHIN GROUP (ORDER BY cpu_iowait_percent) AS m_avg_iowait,
0.0 AS m_std_iowait,
PERCENTILE_CONT(0.5) WITHIN GROUP (ORDER BY cpu_softirq_percent) AS m_avg_softirq,
PERCENTILE_CONT(0.5) WITHIN GROUP (ORDER BY context_switches_s) AS m_avg_ctx_switches,
PERCENTILE_CONT(0.5) WITHIN GROUP (ORDER BY interrupts_s) AS m_avg_interrupts,
PERCENTILE_CONT(0.5) WITHIN GROUP (ORDER BY softnet_dropped_s) AS m_avg_softnet_dropped,
PERCENTILE_CONT(0.5) WITHIN GROUP (ORDER BY softnet_time_squeeze_s) AS m_avg_softnet_squeeze,
-- Network
PERCENTILE_CONT(0.5) WITHIN GROUP (ORDER BY net_in_mbps) AS m_avg_net_in,
0.0 AS m_std_net_in,
PERCENTILE_CONT(0.5) WITHIN GROUP (ORDER BY net_out_mbps) AS m_avg_net_out,
0.0 AS m_std_net_out,
PERCENTILE_CONT(0.5) WITHIN GROUP (ORDER BY tcp_retrans_s) AS m_sum_tcp_retrans,
PERCENTILE_CONT(0.5) WITHIN GROUP (ORDER BY tcp_fast_retrans_s) AS m_sum_tcp_fast_retrans,
PERCENTILE_CONT(0.5) WITHIN GROUP (ORDER BY tcp_timeouts_s) AS m_sum_tcp_timeouts,
PERCENTILE_CONT(0.5) WITHIN GROUP (ORDER BY network_drops_s) AS m_avg_net_drops,
-- Disk
PERCENTILE_CONT(0.5) WITHIN GROUP (ORDER BY disk_read_mbps) AS m_avg_disk_read,
PERCENTILE_CONT(0.5) WITHIN GROUP (ORDER BY disk_write_mbps) AS m_avg_disk_write,
PERCENTILE_CONT(0.5) WITHIN GROUP (ORDER BY disk_io_ticks_s) AS m_avg_disk_io_ticks,
0.0 AS m_std_disk_io_ticks,
-- IQRs for scaling
(PERCENTILE_CONT(0.75) WITHIN GROUP (ORDER BY cpu_percent) - PERCENTILE_CONT(0.25) WITHIN GROUP (ORDER BY cpu_percent)) AS s_avg_cpu,
(PERCENTILE_CONT(0.75) WITHIN GROUP (ORDER BY cpu_iowait_percent) - PERCENTILE_CONT(0.25) WITHIN GROUP (ORDER BY cpu_iowait_percent)) AS s_avg_iowait,
(PERCENTILE_CONT(0.75) WITHIN GROUP (ORDER BY net_in_mbps) - PERCENTILE_CONT(0.25) WITHIN GROUP (ORDER BY net_in_mbps)) AS s_avg_net_in,
(PERCENTILE_CONT(0.75) WITHIN GROUP (ORDER BY net_out_mbps) - PERCENTILE_CONT(0.25) WITHIN GROUP (ORDER BY net_out_mbps)) AS s_avg_net_out,
(PERCENTILE_CONT(0.75) WITHIN GROUP (ORDER BY disk_io_ticks_s) - PERCENTILE_CONT(0.25) WITHIN GROUP (ORDER BY disk_io_ticks_s)) AS s_avg_disk_io_ticks
FROM raw_metrics
WHERE timestamp >= $1 AND timestamp < $2
)
SELECT feature_name, mean, std FROM (
SELECT 'avg_cpu' AS feature_name, s.m_avg_cpu AS mean, GREATEST(s.s_avg_cpu, 1e-9) AS std FROM stats s UNION ALL
SELECT 'max_cpu', s.m_max_cpu, GREATEST(s.s_avg_cpu, 1e-9) FROM stats s UNION ALL
SELECT 'std_cpu', 0.0, 1.0 FROM stats s UNION ALL
SELECT 'avg_iowait', s.m_avg_iowait, GREATEST(s.s_avg_iowait, 1e-9) FROM stats s UNION ALL
SELECT 'std_iowait', 0.0, 1.0 FROM stats s UNION ALL
SELECT 'avg_softirq', s.m_avg_softirq, 1.0 FROM stats s UNION ALL
SELECT 'avg_ctx_switches', s.m_avg_ctx_switches, 1.0 FROM stats s UNION ALL
SELECT 'avg_interrupts', s.m_avg_interrupts, 1.0 FROM stats s UNION ALL
SELECT 'avg_softnet_dropped', s.m_avg_softnet_dropped, 1.0 FROM stats s UNION ALL
SELECT 'avg_softnet_squeeze', s.m_avg_softnet_squeeze, 1.0 FROM stats s UNION ALL
SELECT 'avg_net_in', s.m_avg_net_in, GREATEST(s.s_avg_net_in, 1e-9) FROM stats s UNION ALL
SELECT 'std_net_in', 0.0, 1.0 FROM stats s UNION ALL
SELECT 'avg_net_out', s.m_avg_net_out, GREATEST(s.s_avg_net_out, 1e-9) FROM stats s UNION ALL
SELECT 'std_net_out', 0.0, 1.0 FROM stats s UNION ALL
SELECT 'sum_tcp_retrans', s.m_sum_tcp_retrans, 1.0 FROM stats s UNION ALL
SELECT 'sum_tcp_fast_retrans', s.m_sum_tcp_fast_retrans, 1.0 FROM stats s UNION ALL
SELECT 'sum_tcp_timeouts', s.m_sum_tcp_timeouts, 1.0 FROM stats s UNION ALL
SELECT 'avg_net_drops', s.m_avg_net_drops, 1.0 FROM stats s UNION ALL
SELECT 'avg_disk_read', s.m_avg_disk_read, 1.0 FROM stats s UNION ALL
SELECT 'avg_disk_write', s.m_avg_disk_write, 1.0 FROM stats s UNION ALL
SELECT 'avg_disk_io_ticks', s.m_avg_disk_io_ticks, GREATEST(s.s_avg_disk_io_ticks, 1e-9) FROM stats s UNION ALL
SELECT 'std_disk_io_ticks', 0.0, 1.0 FROM stats s UNION ALL
SELECT 'error_count', 0.0, 1.0 UNION ALL
SELECT 'severity_score', 0.0, 1.0
) t`
}
// BuildFusionQuery assembles the SQL statement that produces the feature row
// of the most recent tumbling window.
//
// The statement buckets raw_metrics and log_events with
// time_bucket(INTERVAL '<windowInterval>', ...), left-joins the log
// aggregates (and, when configured, the log-parameter and service-status
// aggregates) onto the metric aggregates, cross-joins the single-row scaler
// CTE built from scaler_params, and returns only the latest window
// (ORDER BY ws DESC LIMIT 1).
//
// Parameters:
//   - maskingPatterns: every numeric pattern (see collectNumericCols) adds an
//     avg_param_<Name> column, 0.0 when the window has no log_params rows.
//   - systemctlServices: every service adds an svc_<name> column, where <name>
//     is the service name with each ASCII character outside [A-Za-z0-9_]
//     replaced by '_', and 0 when the window has no service_status rows.
//   - windowInterval: a DuckDB interval literal body such as "5 seconds".
//
// For each entry of scalerFeatureNames the result also contains a scaled
// column sc_<feature> = (COALESCE(value, 0) - mean) / std; features missing
// from scaler_params fall back to mean 0 and std 1.
//
// All three arguments are interpolated into the SQL text and must therefore
// come from trusted configuration, never from untrusted input.
func BuildFusionQuery(maskingPatterns []config.MaskingPattern, systemctlServices []string, windowInterval string) string {
	// Optional CTE/select/join fragments for the Drain parameter averages.
	numericCols := collectNumericCols(maskingPatterns)
	paramCTE := ""
	paramSelect := ""
	paramJoin := ""
	if len(numericCols) > 0 {
		aggs := make([]string, 0, len(numericCols))
		for _, col := range numericCols {
			aggs = append(aggs, fmt.Sprintf("AVG(%s) AS avg_%s", col, col))
			paramSelect += fmt.Sprintf(", COALESCE(p.avg_%s, 0.0) AS avg_%s", col, col)
		}
		paramCTE = fmt.Sprintf(`, param_agg AS (SELECT time_bucket(INTERVAL '%s', event_time) AS ws, %s FROM log_params GROUP BY 1)`, windowInterval, strings.Join(aggs, ", "))
		paramJoin = "LEFT JOIN param_agg p ON m.ws = p.ws"
	}

	// identRune maps every ASCII character that is not valid in an unquoted
	// SQL identifier to '_'. "." and "-" are handled exactly as before; other
	// characters (for example the "@" of systemd template units) used to be
	// passed through and produced a statement that could not be parsed.
	identRune := func(r rune) rune {
		switch {
		case r >= 'a' && r <= 'z', r >= 'A' && r <= 'Z', r >= '0' && r <= '9', r == '_', r >= 0x80:
			return r
		}
		return '_'
	}

	// Optional CTE/select/join fragments for the service states.
	svcCTE := ""
	svcSelect := ""
	svcJoin := ""
	if len(systemctlServices) > 0 {
		svcAggs := make([]string, 0, len(systemctlServices))
		for _, svc := range systemctlServices {
			safeName := strings.Map(identRune, svc)
			// NOTE(review): this aggregate is not restricted to rows of svc, so
			// every state_<name> column is the mode over ALL services in the
			// window. A per-service filter looks intended, but the
			// service_status schema is not visible here — confirm the column
			// name before adding one.
			svcAggs = append(svcAggs, fmt.Sprintf(`MODE(CASE WHEN active_state = 'active' THEN 1 WHEN active_state = 'failed' THEN -1 ELSE 0 END) AS state_%s`, safeName))
			svcSelect += fmt.Sprintf(", COALESCE(s.state_%s, 0) AS svc_%s", safeName, safeName)
		}
		svcCTE = fmt.Sprintf(`, svc_agg AS (SELECT time_bucket(INTERVAL '%s', timestamp) AS ws, %s FROM service_status GROUP BY 1)`, windowInterval, strings.Join(svcAggs, ", "))
		svcJoin = "LEFT JOIN svc_agg s ON m.ws = s.ws"
	}

	// Pivot scaler_params into a single row with m_<feature>/s_<feature>
	// columns, and build the scaled output column of every feature.
	scFields := make([]string, 0, len(scalerFeatureNames))
	normVecFields := make([]string, 0, len(scalerFeatureNames))
	for _, name := range scalerFeatureNames {
		scFields = append(scFields, fmt.Sprintf("COALESCE(MAX(CASE WHEN feature_name='%s' THEN mean END),0) AS m_%s, COALESCE(MAX(CASE WHEN feature_name='%s' THEN std END),1) AS s_%s", name, name, name, name))

		// DuckDB aggregation aliases match these exactly (see metric_agg and
		// log_agg below); only the two log features live in log_agg.
		src := "m." + name
		if name == "severity_score" || name == "error_count" {
			src = "l." + name
		}
		normVecFields = append(normVecFields, fmt.Sprintf("(COALESCE(%s, 0.0) - sc.m_%s) / sc.s_%s AS sc_%s", src, name, name, name))
	}

	return fmt.Sprintf(`
WITH metric_agg AS (
SELECT
time_bucket(INTERVAL '%[1]s', timestamp) AS ws,
AVG(cpu_percent) AS avg_cpu, MAX(cpu_percent) AS max_cpu, STDDEV_SAMP(cpu_percent) AS std_cpu,
AVG(cpu_iowait_percent) AS avg_iowait, STDDEV_SAMP(cpu_iowait_percent) AS std_iowait,
AVG(cpu_softirq_percent) AS avg_softirq, AVG(context_switches_s) AS avg_ctx_switches,
AVG(interrupts_s) AS avg_interrupts, AVG(softnet_dropped_s) AS avg_softnet_dropped,
AVG(softnet_time_squeeze_s) AS avg_softnet_squeeze,
AVG(memory_used_mb) AS avg_mem_used, AVG(memory_cached_mb) AS avg_mem_cached, MAX(memory_dirty_mb) AS max_mem_dirty,
AVG(net_in_mbps) AS avg_net_in, STDDEV_SAMP(net_in_mbps) AS std_net_in,
AVG(net_out_mbps) AS avg_net_out, STDDEV_SAMP(net_out_mbps) AS std_net_out,
SUM(tcp_retrans_s) AS sum_tcp_retrans, SUM(tcp_fast_retrans_s) AS sum_tcp_fast_retrans,
SUM(tcp_timeouts_s) AS sum_tcp_timeouts, AVG(network_drops_s) AS avg_net_drops,
AVG(disk_read_mbps) AS avg_disk_read, AVG(disk_write_mbps) AS avg_disk_write,
AVG(disk_io_ticks_s) AS avg_disk_io_ticks, STDDEV_SAMP(disk_io_ticks_s) AS std_disk_io_ticks,
SUM(disk_read_time_s) AS sum_disk_read_time, SUM(disk_write_time_s) AS sum_disk_write_time,
SUM(disk_reads_s) AS sum_disk_reads, SUM(disk_writes_s) AS sum_disk_writes,
SUM(net_packets_in_s) AS sum_packets_in, SUM(net_packets_out_s) AS sum_packets_out
FROM raw_metrics GROUP BY 1
),
log_agg AS (
SELECT
time_bucket(INTERVAL '%[1]s', timestamp) AS ws,
COUNT(*) AS log_event_count, COUNT(DISTINCT template_id) AS unique_templates,
SUM(CASE WHEN severity = 'ERROR' THEN 1 ELSE 0 END) AS error_count,
SUM(CASE
WHEN severity = 'ERROR' THEN 10
WHEN severity = 'WARN' THEN 3
ELSE 1
END) AS severity_score
FROM log_events GROUP BY 1
)%[2]s%[3]s,
scaler AS (
SELECT %[4]s FROM scaler_params
)
SELECT m.ws,
m.*, l.log_event_count, l.unique_templates, l.error_count, l.severity_score%[5]s%[6]s,
%[7]s
FROM metric_agg m
LEFT JOIN log_agg l ON m.ws = l.ws
%[8]s %[9]s
CROSS JOIN scaler sc
ORDER BY m.ws DESC LIMIT 1`,
		windowInterval, paramCTE, svcCTE, strings.Join(scFields, ", "), paramSelect, svcSelect, strings.Join(normVecFields, ", "), paramJoin, svcJoin)
}
// BuildLogParamsSchema returns the DDL that creates the log_params table if
// it does not exist yet. The table always has an event_time column, followed
// by one param_<Name> column per named masking pattern, typed via sqlType.
// Patterns without a Name contribute no column.
func BuildLogParamsSchema(patterns []config.MaskingPattern) string {
	columns := make([]string, 0, len(patterns)+1)
	columns = append(columns, "event_time TIMESTAMP WITH TIME ZONE")
	for _, pattern := range patterns {
		if pattern.Name != "" {
			columns = append(columns, fmt.Sprintf("param_%s %s", pattern.Name, sqlType(pattern.Type)))
		}
	}
	body := strings.Join(columns, ",\n\t")
	return fmt.Sprintf("CREATE TABLE IF NOT EXISTS log_params (\n\t%s\n)", body)
}
// sqlType maps a masking-pattern type name to the SQL column type used in
// log_params: "float" becomes DOUBLE, "int" becomes BIGINT, and anything
// else (including the empty string) becomes VARCHAR.
func sqlType(t string) string {
	if t == "float" {
		return "DOUBLE"
	}
	if t == "int" {
		return "BIGINT"
	}
	return "VARCHAR"
}
// collectNumericCols returns the log_params column names ("param_<Name>") of
// the masking patterns whose captured value is numeric, in pattern order.
//
// A pattern counts as numeric only when its Type is "float" or "int", i.e.
// exactly the types that sqlType maps to a numeric SQL column. Unnamed
// patterns never have a column. Any other Type (including "string" and the
// empty string) is stored as VARCHAR and is therefore excluded, because the
// fusion query applies AVG to every column returned here.
func collectNumericCols(patterns []config.MaskingPattern) []string {
	var cols []string
	for _, mp := range patterns {
		if mp.Name == "" {
			continue
		}
		switch mp.Type {
		case "float", "int":
			cols = append(cols, "param_"+mp.Name)
		}
	}
	return cols
}

302
pkg/types/types.go Normal file
View file

@ -0,0 +1,302 @@
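// Package types defines the shared data structures that flow between pipeline
// stages. All types are JSON-serialisable. They can be copied by value, but
// note that LogEvent and FeatureVector contain maps and slices, so a copy
// shares that underlying storage with the original.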
package types
import "time"
// ── LogEvent ─────────────────────────────────────────────────────────────────
// LogEvent represents a single parsed log line after Drain3 template mining.
type LogEvent struct {
	Timestamp time.Time `json:"timestamp"`
	TemplateID int `json:"template_id"` // identifier of the mined template this line belongs to
	// Params is a map, so copies of a LogEvent share it with the original.
	// presumably the named captures extracted during masking — verify against the producer
	Params map[string]string `json:"params"`
	// The fusion query gives special weight to the values "ERROR" and "WARN";
	// every other value is scored as an ordinary line.
	Severity string `json:"severity"`
	RawLine string `json:"raw_line"`
}
// ── ServiceStatus ─────────────────────────────────────────────────────────────
// ServiceStatus represents the state of a systemd service.
//
// The fusion query encodes ActiveState numerically: "active" as 1, "failed"
// as -1 and every other value as 0.
type ServiceStatus struct {
	Timestamp time.Time `json:"timestamp"`
	ServiceName string `json:"service_name"`
	ActiveState string `json:"active_state"` // e.g. "active", "inactive", "failed"
	SubState string `json:"sub_state"` // e.g. "running", "dead", "exited"
}
// ── MetricSnapshot ────────────────────────────────────────────────────────────
// MetricSnapshot is a 1 Hz sample of Linux system metrics collected from /proc.
//
// Most JSON tags coincide with the raw_metrics column names used by the
// transform queries (for example cpu_percent, net_in_mbps, tcp_retrans_s).
// Units are only indicated by the field names (Percent, MB, MBps, PerS).
type MetricSnapshot struct {
	Timestamp time.Time `json:"timestamp"`

	// CPU and scheduler
	CPUPercent float64 `json:"cpu_percent"`
	CPUIoWaitPercent float64 `json:"cpu_iowait_percent"`
	CPUSoftIrqPercent float64 `json:"cpu_softirq_percent"`
	ContextSwitchesPerS float64 `json:"context_switches_s"`
	InterruptsPerS float64 `json:"interrupts_s"`

	// Memory
	MemoryUsedMB float64 `json:"memory_used_mb"`
	MemoryCachedMB float64 `json:"memory_cached_mb"`
	MemoryDirtyMB float64 `json:"memory_dirty_mb"`

	// Network and TCP
	NetworkInMBps float64 `json:"net_in_mbps"`
	NetworkOutMBps float64 `json:"net_out_mbps"`
	NetErrorsPerS float64 `json:"network_errors_s"`
	NetDropsPerS float64 `json:"network_drops_s"`
	TCPRetransPerS float64 `json:"tcp_retrans_s"`
	TCPTimeoutsPerS float64 `json:"tcp_timeouts_s"`
	TCPLostRetransmitPerS float64 `json:"tcp_lost_retransmit_s"`
	TCPFastRetransPerS float64 `json:"tcp_fast_retrans_s"`

	// Disk
	DiskReadMBps float64 `json:"disk_read_mbps"`
	DiskWriteMBps float64 `json:"disk_write_mbps"`
	// NOTE(review): the field names say "Ms per second" while the tags end in
	// "_time_s" — confirm the intended unit.
	DiskReadTimeMsPerS float64 `json:"disk_read_time_s"`
	DiskWriteTimeMsPerS float64 `json:"disk_write_time_s"`
	DiskIOTicksPerS float64 `json:"disk_io_ticks_s"`

	// Softnet and packet / IO operation counters
	SoftnetDroppedPerS float64 `json:"softnet_dropped_s"`
	SoftnetTimeSqueezePerS float64 `json:"softnet_time_squeeze_s"`
	NetPacketsInPerS float64 `json:"net_packets_in_s"`
	NetPacketsOutPerS float64 `json:"net_packets_out_s"`
	DiskReadsCompletedPerS float64 `json:"disk_reads_s"`
	DiskWritesCompletedPerS float64 `json:"disk_writes_s"`
}
// ── FeatureVector ─────────────────────────────────────────────────────────────
// FeatureVector is the output of the DuckDB Tumbling-Window fusion layer.
//
// # NormalizedVector layout
//
// NOTE(review): the table below appears to be out of date and should be
// confirmed against the transform engine. ToFloatSlice emits 24 base
// aggregates followed by 21 engineered features (45 slots before the param
// averages), and TestTransformEngine_Fusion expects at least 45 slots with
// DeltaNetIn at slot 39 and NetAsymmetry at slot 44, not 29 and 34. Several
// names listed here (max_tcp_retrans, sum_tcp_lost_retrans, error_rate,
// RatioTCPNet) have no corresponding field in this struct.
//
// Slot 0–4: CPU (DuckDB RobustScaled)
// 0=avg_cpu 1=max_cpu 2=avg_iowait 3=avg_softirq 4=avg_ctx_switches
// Slot 5–7: Memory (DuckDB RobustScaled)
// 5=avg_mem_used 6=avg_mem_cached 7=max_mem_dirty
// Slot 8: Disk (DuckDB RobustScaled)
// 8=avg_disk_io_ticks
// Slot 9–12: Network (DuckDB RobustScaled)
// 9=avg_net_in 10=avg_net_out 11=avg_net_drops 12=avg_softnet_squeeze
// Slot 13–16: TCP (DuckDB RobustScaled)
// 13=max_tcp_retrans 14=sum_tcp_fast_retrans
// 15=sum_tcp_timeouts 16=sum_tcp_lost_retrans
// Slot 17–20: Log (DuckDB RobustScaled)
// 17=log_event_count 18=error_count 19=unique_templates 20=error_rate
// Slot 21: CPUDelta Δavg_cpu vs previous window, %-points (unscaled)
// Slot 22: RatioTCPNet sum_tcp_retrans / (avg_net_out + 1e-3), CV=10 (NEW)
// Slot 23: DeltaCtx Δavg_ctx_switches vs previous window, CV=6.2 (NEW)
// Slot 24: NetDelta Δavg_net_out vs previous window, MBps (unscaled)
// Slot 25: CPURollStd rolling σ(avg_cpu, 12 windows) (unscaled)
// Slot 26: CPUEfficiency avg_cpu / (avg_net_out + 1) (unscaled)
// Slot 27: IOWaitProxy avg_disk_io_ticks / (avg_cpu + 1) (unscaled)
// Slot 28: LogDensity unique_templates / (log_count + 1) (unscaled)
// Slot 29: DeltaNetIn Δavg_net_in vs previous window, MBps (unscaled)
// Slot 30: DeltaTCPRetrans Δsum_tcp_retrans vs previous window (unscaled)
// Slot 31: TcpRollStd rolling σ(sum_tcp_retrans, 5 windows) (unscaled)
// Slot 32: NetRollStd rolling σ(avg_net_out, 5 windows) (unscaled)
// Slot 33: MemPressure avg_dirty_mb / (avg_mem_used + 1) (unscaled)
// Slot 34: NetAsymmetry avg_net_in / (avg_net_out + 1e-3) (unscaled)
// Slot 35+: Drain param averages (unscaled)
type FeatureVector struct {
	Timestamp time.Time `json:"timestamp"`
	WindowStart time.Time `json:"window_start"`
	WindowEnd time.Time `json:"window_end"`
	// CPU aggregations
	AvgCPUPercent float64 `json:"avg_cpu"`
	MaxCPUPercent float64 `json:"max_cpu"`
	StdCPUPercent float64 `json:"std_cpu"`
	AvgCPUIoWait float64 `json:"avg_iowait"`
	StdCPUIoWait float64 `json:"std_iowait"`
	AvgCPUSoftIrq float64 `json:"avg_softirq"`
	AvgCtxSwitches float64 `json:"avg_ctx_switches"`
	AvgInterrupts float64 `json:"avg_interrupts"`
	// Memory aggregations
	AvgMemUsedMB float64 `json:"avg_mem_used"`
	AvgMemCachedMB float64 `json:"avg_mem_cached"`
	MaxMemDirtyMB float64 `json:"max_mem_dirty"`
	// Disk aggregations
	AvgDiskIOTicks float64 `json:"avg_disk_io_ticks"`
	StdDiskIOTicks float64 `json:"std_disk_io_ticks"`
	AvgDiskReadMBps float64 `json:"avg_disk_read"`
	AvgDiskWriteMBps float64 `json:"avg_disk_write"`
	// Network aggregations
	AvgNetInMBps float64 `json:"avg_net_in"`
	StdNetInMBps float64 `json:"std_net_in"`
	AvgNetOutMBps float64 `json:"avg_net_out"`
	StdNetOutMBps float64 `json:"std_net_out"`
	AvgNetDrops float64 `json:"avg_net_drops"`
	AvgSoftnetDropped float64 `json:"avg_softnet_dropped"`
	AvgSoftnetSqueeze float64 `json:"avg_softnet_squeeze"`
	// TCP aggregations
	SumTCPRetrans float64 `json:"sum_tcp_retrans"`
	SumTCPFastRetrans float64 `json:"sum_tcp_fast_retrans"`
	SumTCPTimeouts float64 `json:"sum_tcp_timeouts"`
	// Log aggregations
	ErrorCount int `json:"error_count"`
	SeverityScore float64 `json:"severity_score"`
	// Engineered / Derived features
	CPUDelta float64 `json:"cpu_delta"`
	CPURollStd float64 `json:"cpu_roll_std"`
	CPUEfficiency float64 `json:"cpu_efficiency"`
	DeltaCtx float64 `json:"delta_ctx"`
	NetDelta float64 `json:"net_delta"`
	AvgNetThroughput float64 `json:"avg_net_throughput"`
	CPUPerMB float64 `json:"cpu_per_mb"`
	NetworkDiskRatio float64 `json:"network_disk_ratio"`
	RetransPerPacket float64 `json:"retrans_per_packet"`
	RetransPerMB float64 `json:"retrans_per_mb"`
	AvgDiskLatencyMS float64 `json:"avg_disk_latency_ms"`
	LogCountTotal int `json:"log_count_total"`
	UniqueTemplates int `json:"unique_templates"`
	LogDensity float64 `json:"log_density"`
	IOWaitProxy float64 `json:"io_wait_proxy"`
	DeltaNetIn float64 `json:"delta_net_in"`
	DeltaTCPRetrans float64 `json:"delta_tcp_retrans"`
	TcpRollStd float64 `json:"tcp_roll_std"`
	NetRollStd float64 `json:"net_roll_std"`
	MemPressure float64 `json:"mem_pressure"`
	NetAsymmetry float64 `json:"net_asymmetry"`
	// Drain parameter aggregations
	// (a map: copies of a FeatureVector share it with the original)
	ParamAvg map[string]float64 `json:"param_avg"`
	// ServiceStatuses maps service names to their encoded state (active=1, inactive=0, failed=-1).
	ServiceStatuses map[string]float64 `json:"service_statuses"`
	// NormalizedVector is the flat float64 slice consumed by anomaly detectors.
	NormalizedVector []float64 `json:"normalized_vector"`
}
// ToFloatSlice flattens fv into a []float64 with a fixed, deterministic order,
// intended for offline EDA. The values are the raw (unscaled) fields; use
// NormalizedVector for ML inference.
//
// The first 45 entries are always, in this order:
//
//	[avg_cpu, max_cpu, std_cpu,
//	 avg_iowait, std_iowait, avg_softirq, avg_ctx_switches, avg_interrupts,
//	 avg_softnet_dropped, avg_softnet_squeeze,
//	 avg_net_in, std_net_in, avg_net_out, std_net_out,
//	 sum_tcp_retrans, sum_tcp_fast_retrans, sum_tcp_timeouts, avg_net_drops,
//	 avg_disk_read, avg_disk_write, avg_disk_io_ticks, std_disk_io_ticks,
//	 error_count, severity_score,
//	 cpu_delta, cpu_roll_std, cpu_efficiency, delta_ctx, net_delta,
//	 avg_net_throughput, cpu_per_mb, network_disk_ratio, retrans_per_packet,
//	 retrans_per_mb, avg_disk_latency_ms, log_count_total, unique_templates,
//	 log_density, io_wait_proxy, delta_net_in, delta_tcp_retrans,
//	 tcp_roll_std, net_roll_std, mem_pressure, net_asymmetry]
//
// They are followed by one entry per element of paramNames, looked up in
// ParamAvg; a name that is missing from ParamAvg yields 0.
func (fv FeatureVector) ToFloatSlice(paramNames []string) []float64 {
	fixed := [...]float64{
		// Base aggregates (24)
		fv.AvgCPUPercent, fv.MaxCPUPercent, fv.StdCPUPercent,
		fv.AvgCPUIoWait, fv.StdCPUIoWait, fv.AvgCPUSoftIrq, fv.AvgCtxSwitches, fv.AvgInterrupts,
		fv.AvgSoftnetDropped, fv.AvgSoftnetSqueeze,
		fv.AvgNetInMBps, fv.StdNetInMBps, fv.AvgNetOutMBps, fv.StdNetOutMBps,
		fv.SumTCPRetrans, fv.SumTCPFastRetrans, fv.SumTCPTimeouts, fv.AvgNetDrops,
		fv.AvgDiskReadMBps, fv.AvgDiskWriteMBps, fv.AvgDiskIOTicks, fv.StdDiskIOTicks,
		float64(fv.ErrorCount), fv.SeverityScore,
		// Engineered features (21)
		fv.CPUDelta, fv.CPURollStd, fv.CPUEfficiency, fv.DeltaCtx, fv.NetDelta,
		fv.AvgNetThroughput, fv.CPUPerMB, fv.NetworkDiskRatio, fv.RetransPerPacket,
		fv.RetransPerMB, fv.AvgDiskLatencyMS, float64(fv.LogCountTotal),
		float64(fv.UniqueTemplates), fv.LogDensity, fv.IOWaitProxy,
		fv.DeltaNetIn, fv.DeltaTCPRetrans, fv.TcpRollStd, fv.NetRollStd,
		fv.MemPressure, fv.NetAsymmetry,
	}

	// len(fixed) is 45; reserve room for the parameter averages up front.
	flat := make([]float64, 0, len(fixed)+len(paramNames))
	flat = append(flat, fixed[:]...)
	for i := range paramNames {
		flat = append(flat, fv.ParamAvg[paramNames[i]])
	}
	return flat
}
// ToNamedMap returns the raw (unscaled) features of fv keyed by name. The 45
// fixed keys are the JSON tag names of the corresponding fields; in addition,
// every entry of paramNames is stored under "avg_param_<name>" with the value
// found in ParamAvg (0 when the name is missing there).
func (fv FeatureVector) ToNamedMap(paramNames []string) map[string]float64 {
	named := make(map[string]float64, 45+len(paramNames))

	// Base aggregates
	named["avg_cpu"] = fv.AvgCPUPercent
	named["max_cpu"] = fv.MaxCPUPercent
	named["std_cpu"] = fv.StdCPUPercent
	named["avg_iowait"] = fv.AvgCPUIoWait
	named["std_iowait"] = fv.StdCPUIoWait
	named["avg_softirq"] = fv.AvgCPUSoftIrq
	named["avg_ctx_switches"] = fv.AvgCtxSwitches
	named["avg_interrupts"] = fv.AvgInterrupts
	named["avg_softnet_dropped"] = fv.AvgSoftnetDropped
	named["avg_softnet_squeeze"] = fv.AvgSoftnetSqueeze
	named["avg_net_in"] = fv.AvgNetInMBps
	named["std_net_in"] = fv.StdNetInMBps
	named["avg_net_out"] = fv.AvgNetOutMBps
	named["std_net_out"] = fv.StdNetOutMBps
	named["avg_net_drops"] = fv.AvgNetDrops
	named["sum_tcp_retrans"] = fv.SumTCPRetrans
	named["sum_tcp_fast_retrans"] = fv.SumTCPFastRetrans
	named["sum_tcp_timeouts"] = fv.SumTCPTimeouts
	named["avg_disk_read"] = fv.AvgDiskReadMBps
	named["avg_disk_write"] = fv.AvgDiskWriteMBps
	named["avg_disk_io_ticks"] = fv.AvgDiskIOTicks
	named["std_disk_io_ticks"] = fv.StdDiskIOTicks
	named["error_count"] = float64(fv.ErrorCount)
	named["severity_score"] = fv.SeverityScore

	// Engineered features
	named["cpu_delta"] = fv.CPUDelta
	named["cpu_roll_std"] = fv.CPURollStd
	named["cpu_efficiency"] = fv.CPUEfficiency
	named["delta_ctx"] = fv.DeltaCtx
	named["net_delta"] = fv.NetDelta
	named["avg_net_throughput"] = fv.AvgNetThroughput
	named["cpu_per_mb"] = fv.CPUPerMB
	named["network_disk_ratio"] = fv.NetworkDiskRatio
	named["retrans_per_packet"] = fv.RetransPerPacket
	named["retrans_per_mb"] = fv.RetransPerMB
	named["avg_disk_latency_ms"] = fv.AvgDiskLatencyMS
	named["log_count_total"] = float64(fv.LogCountTotal)
	named["unique_templates"] = float64(fv.UniqueTemplates)
	named["log_density"] = fv.LogDensity
	named["io_wait_proxy"] = fv.IOWaitProxy
	named["delta_net_in"] = fv.DeltaNetIn
	named["delta_tcp_retrans"] = fv.DeltaTCPRetrans
	named["tcp_roll_std"] = fv.TcpRollStd
	named["net_roll_std"] = fv.NetRollStd
	named["mem_pressure"] = fv.MemPressure
	named["net_asymmetry"] = fv.NetAsymmetry

	// Drain parameter averages
	for i := range paramNames {
		param := paramNames[i]
		named["avg_param_"+param] = fv.ParamAvg[param]
	}
	return named
}
// ── AnomalyResult ─────────────────────────────────────────────────────────────
// AnomalyResult is the final output of the detection layer.
type AnomalyResult struct {
	Timestamp time.Time `json:"timestamp"`
	Score float64 `json:"score"`
	IsAnomaly bool `json:"is_anomaly"`
	Confidence float64 `json:"confidence"`
	Method string `json:"method"` // presumably the name of the detector that produced the result — verify against the producer
	Details string `json:"details,omitempty"` // omitted from the JSON output when empty
}
// ── StageHealth ───────────────────────────────────────────────────────────────
// StageHealth stores per-stage monitoring counters.
//
// The health monitor keeps only the most recent StageHealth per StageName, so
// each update replaces the previous one for that stage.
type StageHealth struct {
	StageName string `json:"stage_name"` // key under which the monitor stores the update
	EventsProcessed uint64 `json:"events_processed"`
	EventsDropped uint64 `json:"events_dropped"`
	AvgLatencyMs float64 `json:"avg_latency_ms"`
	Throughput float64 `json:"throughput_eps"` // events per second, going by the JSON tag
	LastUpdate time.Time `json:"last_update"`
}