commit for version used in evaluation of thesis
This commit is contained in:
commit
72635dc7b9
27 changed files with 6084 additions and 0 deletions
50
Makefile
Normal file
50
Makefile
Normal file
|
|
@ -0,0 +1,50 @@
|
||||||
|
# ── Configuration ─────────────────────────────────────────────────────────────

# Name of the produced executable and the directory it is written to.
BINARY    := guenther
BUILD_DIR := build
# Go entry point handed to `go build`.
CMD       := ./cmd/pipeline/main.go
# Config file passed to the binary by `make run`.
CONFIG    := configs/default.yaml

# Container image used by the Docker build.
GO_IMAGE   := golang:bookworm
# Build tags passed to both `go build` and `go test`.
BUILD_TAGS := duckdb_arrow
# Linker flags forwarded through -ldflags.
LDFLAGS    := -s -w

GO_BUILD_FLAGS := -tags=$(BUILD_TAGS) -buildvcs=false -ldflags='$(LDFLAGS)'

# ── Targets ───────────────────────────────────────────────────────────────────

.PHONY: all build build-local test clean run help

all: build

## build: Build the binary inside a Docker container (no local toolchain needed)
# The bind mount uses $(CURDIR), which make sets itself (and which follows
# `make -C dir`), rather than $(PWD), which is only inherited from the calling
# environment. The path is quoted so a checkout directory containing spaces
# still mounts correctly.
build:
	@mkdir -p $(BUILD_DIR)
	docker run --rm \
		-v "$(CURDIR)":/app:Z \
		-w /app \
		$(GO_IMAGE) \
		sh -c "apt-get update -qq && \
			apt-get install -y -qq gcc libc6-dev && \
			CGO_ENABLED=1 go build $(GO_BUILD_FLAGS) -o $(BUILD_DIR)/$(BINARY) $(CMD) && \
			echo BUILD_OK" \
		2>&1

## build-local: Build the binary using the local Go toolchain (requires gcc)
build-local:
	@mkdir -p $(BUILD_DIR)
	CGO_ENABLED=1 go build $(GO_BUILD_FLAGS) -o $(BUILD_DIR)/$(BINARY) $(CMD)

## test: Run all tests (requires local Go toolchain with gcc)
test:
	CGO_ENABLED=1 go test -v -tags=$(BUILD_TAGS) ./...

## run: Run the pipeline with the default config (binary must be built first)
run: $(BUILD_DIR)/$(BINARY)
	./$(BUILD_DIR)/$(BINARY) -config $(CONFIG)

# Only executed when the binary does not exist yet: fail with an actionable
# message instead of make's generic "No rule to make target" error. An existing
# binary has no prerequisites and is therefore always considered up to date.
$(BUILD_DIR)/$(BINARY):
	@echo "error: $@ not found; run 'make build' or 'make build-local' first" >&2
	@exit 1

## clean: Remove build artefacts
clean:
	rm -rf $(BUILD_DIR)

## help: Show this help message
# Prints every line of this Makefile that starts with "## ".
help:
	@grep -E '^## ' $(MAKEFILE_LIST) | sed 's/^## / /'
|
||||||
212
README.md
Normal file
212
README.md
Normal file
|
|
@ -0,0 +1,212 @@
|
||||||
|
# guenther
|
||||||
|
|
||||||
|
A streaming anomaly detection pipeline for Managed-File-Transfer (MFT) infrastructure.
|
||||||
|
guenther ingests system metrics and application logs in real time, extracts structured
|
||||||
|
feature vectors per time window, and scores them with an ensemble of unsupervised
|
||||||
|
detectors — without any labelled training data.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## How it works
|
||||||
|
|
||||||
|
```
|
||||||
|
┌─────────────────────────────────────────────────────────────┐
|
||||||
|
│ Ingestion │
|
||||||
|
│ MetricCollector (/proc) LogCollector (inotify + Drain3) │
|
||||||
|
│ SystemctlCollector (service states) │
|
||||||
|
└────────────────────┬────────────────────────────────────────┘
|
||||||
|
│ channels (backpressure)
|
||||||
|
┌────────────────────▼────────────────────────────────────────┐
|
||||||
|
│ Transformation │
|
||||||
|
│ TransformEngine – 30 s tumbling windows via DuckDB │
|
||||||
|
│ 45 base features + N Drain3 parameter aggregates │
|
||||||
|
└────────────────────┬────────────────────────────────────────┘
|
||||||
|
│
|
||||||
|
┌────────────────────▼────────────────────────────────────────┐
|
||||||
|
│ Detection │
|
||||||
|
│ EnsembleDetector (RRCF fast/mid/slow · COPOD · MAD) │
|
||||||
|
│ SEAD online weight adaptation · auto-scaling (3 stages) │
|
||||||
|
└────────────────────┬────────────────────────────────────────┘
|
||||||
|
│
|
||||||
|
anomalies.jsonl
|
||||||
|
```
|
||||||
|
|
||||||
|
### Packages
|
||||||
|
|
||||||
|
| Path | Responsibility |
|
||||||
|
| -------------------- | -------------------------------------------------------------------------------- |
|
||||||
|
| `cmd/pipeline` | Entry point, wiring, graceful shutdown |
|
||||||
|
| `internal/collector` | `MetricCollector` (`/proc`), `LogCollector` (inotify), `SystemctlCollector` |
|
||||||
|
| `internal/transform` | `TransformEngine` — DuckDB windowed aggregation |
|
||||||
|
| `internal/detect` | `EnsembleDetector`, RRCF, COPOD, MAD, IsolationForest, SEAD, `ScalingController` |
|
||||||
|
| `internal/drain3` | Masking / parameter extraction wrapper around Drain3 |
|
||||||
|
| `internal/config` | YAML config loading and regex compilation |
|
||||||
|
| `internal/health` | `HealthMonitor` — per-stage counters |
|
||||||
|
| `pkg/types` | Shared types: `LogEvent`, `MetricSnapshot`, `FeatureVector`, `AnomalyResult` |
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Requirements
|
||||||
|
|
||||||
|
| Dependency | Notes |
|
||||||
|
| --------------- | ------------------------------------------------------------ |
|
||||||
|
| Docker | Required for the containerised build (recommended) |
|
||||||
|
| Go ≥ 1.25.5 | Only needed for local builds (version required by the `go` directive in `go.mod`) |
|
||||||
|
| gcc / libc6-dev | CGO is required by `duckdb-go` (`github.com/duckdb/duckdb-go/v2`) |
|
||||||
|
| Linux | Metric collection reads `/proc`; not supported on other OSes |
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Building
|
||||||
|
|
||||||
|
### Docker (recommended — no local toolchain needed)
|
||||||
|
|
||||||
|
```bash
|
||||||
|
make build
|
||||||
|
```
|
||||||
|
|
||||||
|
The binary is written to `build/guenther`.
|
||||||
|
|
||||||
|
### Local (requires Go + gcc)
|
||||||
|
|
||||||
|
```bash
|
||||||
|
make build-local
|
||||||
|
```
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Running
|
||||||
|
|
||||||
|
```bash
|
||||||
|
./build/guenther -config configs/default.yaml
|
||||||
|
```
|
||||||
|
|
||||||
|
guenther shuts down cleanly on `SIGINT` or `SIGTERM`.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Testing
|
||||||
|
|
||||||
|
```bash
|
||||||
|
make test
|
||||||
|
```
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Configuration
|
||||||
|
|
||||||
|
guenther is configured via a single YAML file (default: `configs/default.yaml`).
|
||||||
|
|
||||||
|
```yaml
|
||||||
|
ingestion:
|
||||||
|
log_path: "/path/to/log/file/transfer.log" # file to tail
|
||||||
|
net_interface: "ens4" # interface for /proc/net/dev
|
||||||
|
disk_device: "vda1" # device for /proc/diskstats
|
||||||
|
systemctl_services:
|
||||||
|
- service1.service
|
||||||
|
- service2.service
|
||||||
|
|
||||||
|
transformation:
|
||||||
|
window_size: "30s" # tumbling window length
|
||||||
|
db_path: "data/pipeline.duckdb" # DuckDB file (use :memory: for ephemeral)
|
||||||
|
|
||||||
|
drain:
|
||||||
|
depth: 4
|
||||||
|
sim_threshold: 0.4
|
||||||
|
max_children: 100
|
||||||
|
max_clusters: 1000
|
||||||
|
masking_patterns: # applied in order before template mining
|
||||||
|
- name: "uuid"
|
||||||
|
pattern: '\b[0-9a-fA-F]{8}-...\b'
|
||||||
|
replace: "<UUID>"
|
||||||
|
type: "string"
|
||||||
|
# ... see configs/default.yaml for the full set
|
||||||
|
|
||||||
|
detector:
|
||||||
|
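method: "ensemble" # single detector used only when ensemble.enabled = false: copod | rrcf | isolation_forest (any other value, including "ensemble", selects isolation_forest)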
|
||||||
|
ensemble:
|
||||||
|
enabled: true
|
||||||
|
method: "sead" # avg | max | median | sead
|
||||||
|
contamination: 0.15
|
||||||
|
sead:
|
||||||
|
eta: 0.1
|
||||||
|
lambda: 0.01
|
||||||
|
auto_scaling:
|
||||||
|
enabled: true
|
||||||
|
high_threshold: 75.0 # CPU % → switch to mid detector
|
||||||
|
critical_threshold: 90.0 # CPU % → switch to fast detector
|
||||||
|
down_threshold: 50.0
|
||||||
|
high_duration: 90.0 # seconds load must persist before scaling
|
||||||
|
critical_duration: 120.0
|
||||||
|
down_duration: 120.0
|
||||||
|
rrcf_variants:
|
||||||
|
fast: { num_trees: 50, tree_size: 32, threshold_percentile: 0.85 }
|
||||||
|
mid: { num_trees: 150, tree_size: 64, threshold_percentile: 0.85 }
|
||||||
|
slow: { num_trees: 200, tree_size: 128, threshold_percentile: 0.85 }
|
||||||
|
copod:
|
||||||
|
buffer_size: 50
|
||||||
|
threshold: 0.3
|
||||||
|
mad:
|
||||||
|
threshold: 3.5
|
||||||
|
calibration_size: 50
|
||||||
|
|
||||||
|
output:
|
||||||
|
feature_log_path: "logs/features.jsonl"
|
||||||
|
anomaly_log_path: "logs/anomalies.jsonl"
|
||||||
|
```
|
||||||
|
|
||||||
|
### Masking pattern types
|
||||||
|
|
||||||
|
Patterns with `type: float` extract a named parameter into `FeatureVector.ParamAvg`;
|
||||||
|
patterns with `type: string` replace the match in-place before template mining.
|
||||||
|
Named patterns (`name != ""`) are aggregated as features per window.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Output
|
||||||
|
|
||||||
|
**`logs/anomalies.jsonl`** — one JSON object per scored window:
|
||||||
|
|
||||||
|
```json
|
||||||
|
{
|
||||||
|
"timestamp": "2026-01-15T14:32:00Z",
|
||||||
|
"score": 0.8721,
|
||||||
|
"is_anomaly": true,
|
||||||
|
"confidence": 0.91,
|
||||||
|
"method": "sead_ensemble",
|
||||||
|
"details": "rrcf_slow=0.91 copod=0.83 mad=0.78"
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
**`logs/features.jsonl`** — raw feature vectors for offline analysis (optional).
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Project layout
|
||||||
|
|
||||||
|
```
|
||||||
|
guenther/
|
||||||
|
├── cmd/
|
||||||
|
│ └── pipeline/
|
||||||
|
│ └── main.go
|
||||||
|
├── internal/
|
||||||
|
│ ├── collector/
|
||||||
|
│ ├── config/
|
||||||
|
│ ├── detect/
|
||||||
|
│ ├── drain3/
|
||||||
|
│ ├── health/
|
||||||
|
│ └── transform/
|
||||||
|
├── pkg/
|
||||||
|
│ └── types/
|
||||||
|
├── configs/
|
||||||
|
│ └── default.yaml
|
||||||
|
├── build/ # created by `make build`
|
||||||
|
├── Makefile
|
||||||
|
└── README.md
|
||||||
|
```
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## License
|
||||||
|
|
||||||
|
This project was developed as part of a Bachelor's thesis.
|
||||||
294
cmd/pipeline/main.go
Normal file
294
cmd/pipeline/main.go
Normal file
|
|
@ -0,0 +1,294 @@
|
||||||
|
// Command pipeline is the entry point for the MFT anomaly detection pipeline.
|
||||||
|
//
|
||||||
|
// Startup order:
|
||||||
|
// 1. Load and compile config (masking patterns → *regexp.Regexp).
|
||||||
|
// 2. Allocate channels with fixed capacities to enable backpressure.
|
||||||
|
// 3. Start HealthMonitor.
|
||||||
|
// 4. Start collectors (MetricCollector, LogCollector, SystemctlCollector).
|
||||||
|
// 5. Start TransformEngine (DuckDB, schema, pre-compiled query).
|
||||||
|
// 6. Start DetectionLayer.
|
||||||
|
// 7. Start anomaly sink goroutine.
|
||||||
|
// 8. Wait for SIGINT / SIGTERM.
|
||||||
|
// 9. Graceful shutdown in pipeline order (collectors → TransformEngine → DetectionLayer → anomaly sink), then the HealthMonitor.
|
||||||
|
package main
|
||||||
|
|
||||||
|
import (
|
||||||
|
"bufio"
|
||||||
|
"context"
|
||||||
|
"encoding/json"
|
||||||
|
"flag"
|
||||||
|
"fmt"
|
||||||
|
"log"
|
||||||
|
"os"
|
||||||
|
"os/signal"
|
||||||
|
"sync"
|
||||||
|
"syscall"
|
||||||
|
"time"
|
||||||
|
|
||||||
|
"codeberg.org/pata1704/guenther/internal/collector"
|
||||||
|
"codeberg.org/pata1704/guenther/internal/config"
|
||||||
|
"codeberg.org/pata1704/guenther/internal/detect"
|
||||||
|
"codeberg.org/pata1704/guenther/internal/health"
|
||||||
|
"codeberg.org/pata1704/guenther/internal/transform"
|
||||||
|
"codeberg.org/pata1704/guenther/pkg/types"
|
||||||
|
)
|
||||||
|
|
||||||
|
func main() {
|
||||||
|
cfgPath := flag.String("config", "configs/default.yaml", "path to config file")
|
||||||
|
flag.Parse()
|
||||||
|
|
||||||
|
cfg, err := config.LoadConfig(*cfgPath)
|
||||||
|
if err != nil {
|
||||||
|
log.Fatalf("load config %q: %v", *cfgPath, err)
|
||||||
|
}
|
||||||
|
if err := cfg.Compile(); err != nil {
|
||||||
|
log.Fatalf("compile masking patterns: %v", err)
|
||||||
|
}
|
||||||
|
|
||||||
|
ctx, cancel := signal.NotifyContext(context.Background(), syscall.SIGINT, syscall.SIGTERM)
|
||||||
|
defer cancel()
|
||||||
|
|
||||||
|
logChan := make(chan types.LogEvent, 1_000)
|
||||||
|
metricChan := make(chan types.MetricSnapshot, 100)
|
||||||
|
serviceStatusChan := make(chan types.ServiceStatus, 100)
|
||||||
|
featureChan := make(chan types.FeatureVector, 10)
|
||||||
|
anomalyChan := make(chan types.AnomalyResult, 50)
|
||||||
|
|
||||||
|
hm := health.NewHealthMonitor()
|
||||||
|
hm.Start(ctx, 5*time.Second)
|
||||||
|
|
||||||
|
metricColl := collector.NewMetricCollector(
|
||||||
|
metricChan, hm.Chan(),
|
||||||
|
time.Second,
|
||||||
|
cfg.Ingestion.NetInterface,
|
||||||
|
cfg.Ingestion.DiskDevice,
|
||||||
|
)
|
||||||
|
logColl := collector.NewLogCollector(cfg, logChan, hm.Chan())
|
||||||
|
sysColl := collector.NewSystemctlCollector(
|
||||||
|
cfg.Ingestion.SystemctlServices,
|
||||||
|
5*time.Second,
|
||||||
|
serviceStatusChan,
|
||||||
|
hm.Chan(),
|
||||||
|
)
|
||||||
|
|
||||||
|
metricColl.Start(ctx)
|
||||||
|
if err := logColl.Start(ctx); err != nil {
|
||||||
|
log.Fatalf("start log collector: %v", err)
|
||||||
|
}
|
||||||
|
sysColl.Start(ctx)
|
||||||
|
|
||||||
|
engine, err := transform.NewTransformEngine(cfg, logChan, metricChan, serviceStatusChan, featureChan, hm.Chan())
|
||||||
|
if err != nil {
|
||||||
|
log.Fatalf("create transform engine: %v", err)
|
||||||
|
}
|
||||||
|
engine.Start(ctx)
|
||||||
|
|
||||||
|
detector, err := buildDetector(cfg)
|
||||||
|
if err != nil {
|
||||||
|
log.Fatalf("build detector: %v", err)
|
||||||
|
}
|
||||||
|
detLayer := detect.NewDetectionLayer(detector, featureChan, anomalyChan, hm.Chan())
|
||||||
|
|
||||||
|
if cfg.Detection.AutoScaling.Enabled {
|
||||||
|
if sd, ok := detector.(*detect.SwitchableDetector); ok {
|
||||||
|
sc := detect.NewScalingController(
|
||||||
|
sd,
|
||||||
|
cfg.Detection.AutoScaling.HighThreshold,
|
||||||
|
cfg.Detection.AutoScaling.CritThreshold,
|
||||||
|
cfg.Detection.AutoScaling.DownThreshold,
|
||||||
|
cfg.Detection.AutoScaling.HighDuration,
|
||||||
|
cfg.Detection.AutoScaling.CritDuration,
|
||||||
|
cfg.Detection.AutoScaling.DownDuration,
|
||||||
|
)
|
||||||
|
detLayer.SetScalingController(sc)
|
||||||
|
log.Println("detector: auto-scaling enabled")
|
||||||
|
} else {
|
||||||
|
log.Println("warning: auto-scaling requested but detector is not switchable (requires SEAD ensemble)")
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
detLayer.Start(ctx)
|
||||||
|
|
||||||
|
anomalyLog := openLog(cfg.Output.AnomalyLogPath, "anomaly log")
|
||||||
|
if anomalyLog != nil {
|
||||||
|
defer anomalyLog.Close()
|
||||||
|
}
|
||||||
|
anomalyWriter := maybeWriter(anomalyLog)
|
||||||
|
|
||||||
|
var sinkWg sync.WaitGroup
|
||||||
|
sinkWg.Add(1)
|
||||||
|
go func() {
|
||||||
|
defer sinkWg.Done()
|
||||||
|
for res := range anomalyChan {
|
||||||
|
writeJSON(anomalyWriter, res)
|
||||||
|
if res.IsAnomaly {
|
||||||
|
log.Printf("[ANOMALY] time=%s score=%.4f method=%s details=%s",
|
||||||
|
res.Timestamp.Format(time.RFC3339), res.Score, res.Method, res.Details)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}()
|
||||||
|
|
||||||
|
// Optionally log SEAD weights periodically (when using SEAD ensemble).
|
||||||
|
if ens, ok := detector.(*detect.EnsembleDetector); ok {
|
||||||
|
go func() {
|
||||||
|
t := time.NewTicker(60 * time.Second)
|
||||||
|
defer t.Stop()
|
||||||
|
for {
|
||||||
|
select {
|
||||||
|
case <-ctx.Done():
|
||||||
|
return
|
||||||
|
case <-t.C:
|
||||||
|
if ws := ens.WeightSummary(); ws != "" {
|
||||||
|
log.Printf("[SEAD weights] %s", ws)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}()
|
||||||
|
}
|
||||||
|
|
||||||
|
log.Println("pipeline started – waiting for SIGINT / SIGTERM")
|
||||||
|
<-ctx.Done()
|
||||||
|
log.Println("shutting down…")
|
||||||
|
|
||||||
|
metricColl.Wait()
|
||||||
|
logColl.Wait()
|
||||||
|
engine.Wait()
|
||||||
|
|
||||||
|
close(featureChan)
|
||||||
|
detLayer.Wait()
|
||||||
|
|
||||||
|
close(anomalyChan)
|
||||||
|
sinkWg.Wait()
|
||||||
|
|
||||||
|
hm.Wait()
|
||||||
|
log.Println("pipeline stopped")
|
||||||
|
}
|
||||||
|
|
||||||
|
// buildDetector constructs the configured AnomalyDetector.
|
||||||
|
//
|
||||||
|
// Routing:
|
||||||
|
// 1. detector.ensemble.enabled = true → EnsembleDetector with the method
|
||||||
|
// specified by detector.ensemble.method ("avg"|"max"|"median"|"sead").
|
||||||
|
// 2. Otherwise fall through to detector.method ("copod"|"rrcf"|"isolation_forest").
|
||||||
|
func buildDetector(cfg *config.Config) (detect.AnomalyDetector, error) {
|
||||||
|
if cfg.Detection.Ensemble.Enabled {
|
||||||
|
method := detect.EnsembleMethod(cfg.Detection.Ensemble.Method)
|
||||||
|
if method == "" {
|
||||||
|
method = detect.EnsembleAVG // backward-compat default
|
||||||
|
}
|
||||||
|
|
||||||
|
// Map SEAD config from YAML to detect.SEADConfig.
|
||||||
|
seadCfg := detect.SEADConfig{
|
||||||
|
Eta: cfg.Detection.Ensemble.SEAD.Eta,
|
||||||
|
Lambda: cfg.Detection.Ensemble.SEAD.Lambda,
|
||||||
|
QuantileWindow: cfg.Detection.Ensemble.SEAD.QuantileWindow,
|
||||||
|
MinDataPoints: cfg.Detection.Ensemble.SEAD.MinDataPoints,
|
||||||
|
Contamination: cfg.Detection.Ensemble.Contamination,
|
||||||
|
}
|
||||||
|
// Apply defaults for zero-value fields.
|
||||||
|
if seadCfg.Eta == 0 {
|
||||||
|
seadCfg.Eta = 0.10
|
||||||
|
}
|
||||||
|
if seadCfg.QuantileWindow == 0 {
|
||||||
|
seadCfg.QuantileWindow = 300
|
||||||
|
}
|
||||||
|
if seadCfg.MinDataPoints == 0 {
|
||||||
|
seadCfg.MinDataPoints = 20
|
||||||
|
}
|
||||||
|
|
||||||
|
det, err := detect.NewEnsembleDetector(
|
||||||
|
method,
|
||||||
|
cfg.Detection.COPOD.BufferSize,
|
||||||
|
cfg.Detection.COPOD.Threshold,
|
||||||
|
detect.RRCFVariantsConfig{
|
||||||
|
Fast: detect.RRCFVariantConfig{
|
||||||
|
NumTrees: cfg.Detection.RRCFVariants.Fast.NumTrees,
|
||||||
|
TreeSize: cfg.Detection.RRCFVariants.Fast.TreeSize,
|
||||||
|
ThresholdPercentile: cfg.Detection.RRCFVariants.Fast.ThresholdPercentile,
|
||||||
|
},
|
||||||
|
Mid: detect.RRCFVariantConfig{
|
||||||
|
NumTrees: cfg.Detection.RRCFVariants.Mid.NumTrees,
|
||||||
|
TreeSize: cfg.Detection.RRCFVariants.Mid.TreeSize,
|
||||||
|
ThresholdPercentile: cfg.Detection.RRCFVariants.Mid.ThresholdPercentile,
|
||||||
|
},
|
||||||
|
Slow: detect.RRCFVariantConfig{
|
||||||
|
NumTrees: cfg.Detection.RRCFVariants.Slow.NumTrees,
|
||||||
|
TreeSize: cfg.Detection.RRCFVariants.Slow.TreeSize,
|
||||||
|
ThresholdPercentile: cfg.Detection.RRCFVariants.Slow.ThresholdPercentile,
|
||||||
|
},
|
||||||
|
},
|
||||||
|
cfg.Detection.Ensemble.Contamination,
|
||||||
|
seadCfg,
|
||||||
|
)
|
||||||
|
if err != nil {
|
||||||
|
return nil, fmt.Errorf("build ensemble detector (%s): %w", method, err)
|
||||||
|
}
|
||||||
|
log.Printf("detector: Ensemble method=%s contamination=%.2f", method, cfg.Detection.Ensemble.Contamination)
|
||||||
|
if method == detect.EnsembleSEAD {
|
||||||
|
log.Printf("detector: SEAD η=%.3f λ=%.3f quantile_window=%d",
|
||||||
|
seadCfg.Eta, seadCfg.Lambda, seadCfg.QuantileWindow)
|
||||||
|
|
||||||
|
// Wrap in SwitchableDetector if using SEAD (required for 3-stage scaling).
|
||||||
|
if sead := det.SEAD(); sead != nil {
|
||||||
|
return detect.NewSwitchableDetector(sead), nil
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return det, nil
|
||||||
|
}
|
||||||
|
|
||||||
|
switch cfg.Detection.Method {
|
||||||
|
case "copod":
|
||||||
|
return detect.NewCOPODDetector(
|
||||||
|
cfg.Detection.COPOD.BufferSize,
|
||||||
|
cfg.Detection.COPOD.Threshold,
|
||||||
|
)
|
||||||
|
case "rrcf":
|
||||||
|
return detect.NewRRCFDetector(
|
||||||
|
cfg.Detection.RRCF.NumTrees,
|
||||||
|
cfg.Detection.RRCF.TreeSize,
|
||||||
|
0,
|
||||||
|
cfg.Detection.RRCF.ThresholdPercentile,
|
||||||
|
), nil
|
||||||
|
default: // "isolation_forest"
|
||||||
|
return detect.NewIsolationForestDetector(
|
||||||
|
5_000, 100, 100, 256, 0.05, 10.0,
|
||||||
|
), nil
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func openLog(path, label string) *os.File {
|
||||||
|
if path == "" {
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
f, err := os.OpenFile(path, os.O_APPEND|os.O_CREATE|os.O_WRONLY, 0o644)
|
||||||
|
if err != nil {
|
||||||
|
log.Printf("warning: cannot open %s %q: %v", label, path, err)
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
return f
|
||||||
|
}
|
||||||
|
|
||||||
|
func maybeWriter(f *os.File) *bufio.Writer {
|
||||||
|
if f == nil {
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
return bufio.NewWriterSize(f, 64*1024)
|
||||||
|
}
|
||||||
|
|
||||||
|
func writeJSON(w *bufio.Writer, v any) {
|
||||||
|
if w == nil {
|
||||||
|
return
|
||||||
|
}
|
||||||
|
b, err := json.Marshal(v)
|
||||||
|
if err != nil {
|
||||||
|
log.Printf("marshal: %v", err)
|
||||||
|
return
|
||||||
|
}
|
||||||
|
if _, err := w.Write(append(b, '\n')); err != nil {
|
||||||
|
log.Printf("write log: %v", err)
|
||||||
|
return
|
||||||
|
}
|
||||||
|
if err := w.Flush(); err != nil {
|
||||||
|
log.Printf("flush log: %v", err)
|
||||||
|
}
|
||||||
|
}
|
||||||
123
configs/default.yaml
Normal file
123
configs/default.yaml
Normal file
|
|
@ -0,0 +1,123 @@
|
||||||
|
ingestion:
|
||||||
|
log_path: "/path/to/log/file/transfer.log"
|
||||||
|
net_interface: "ens4"
|
||||||
|
disk_device: "vda1"
|
||||||
|
systemctl_services:
|
||||||
|
- service1.service
|
||||||
|
- service2.service
|
||||||
|
|
||||||
|
transformation:
|
||||||
|
window_size: "30s"
|
||||||
|
db_path: "data/pipeline_test.duckdb"
|
||||||
|
|
||||||
|
drain:
|
||||||
|
depth: 4
|
||||||
|
sim_threshold: 0.4
|
||||||
|
max_children: 100
|
||||||
|
max_clusters: 1000
|
||||||
|
masking_patterns:
|
||||||
|
- name: "loglevel"
|
||||||
|
pattern: '^(\S+)'
|
||||||
|
replace: "<LOGLEVEL>"
|
||||||
|
type: "string"
|
||||||
|
|
||||||
|
- name: ""
|
||||||
|
pattern: '(\d{4}-\d{2}-\d{2})'
|
||||||
|
replace: "<DATE>"
|
||||||
|
type: "string"
|
||||||
|
|
||||||
|
- name: ""
|
||||||
|
pattern: '(\d{2}:\d{2}:\d{2}\.\d{6})'
|
||||||
|
replace: "<TIME>"
|
||||||
|
type: "string"
|
||||||
|
|
||||||
|
- name: "uuid"
|
||||||
|
pattern: '\b[0-9a-fA-F]{8}-[0-9a-fA-F]{4}-[0-9a-fA-F]{4}-[0-9a-fA-F]{4}-[0-9a-fA-F]{12}\b'
|
||||||
|
replace: "<UUID>"
|
||||||
|
type: "string"
|
||||||
|
|
||||||
|
- name: ""
|
||||||
|
pattern: '\+\]'
|
||||||
|
replace: "<SESSION>"
|
||||||
|
type: "string"
|
||||||
|
|
||||||
|
- name: ""
|
||||||
|
pattern: "(/[a-zA-Z0-9._-]+)+"
|
||||||
|
replace: "<PATH>"
|
||||||
|
type: "string"
|
||||||
|
|
||||||
|
- name: ""
|
||||||
|
pattern: '(sync-file-reader|checksum|xp-network-(?:sender|receiver)|aes-crypt)-\d+:'
|
||||||
|
replace: "<MODULE>:"
|
||||||
|
type: "string"
|
||||||
|
|
||||||
|
- name: "datarate"
|
||||||
|
pattern: 'datarate=\s*(\d+(?:\.\d+)?)'
|
||||||
|
replace: "<datarate>"
|
||||||
|
type: "float"
|
||||||
|
|
||||||
|
- name: "duration"
|
||||||
|
pattern: 'duration=\s*(\d+(?:\.\d+)?)'
|
||||||
|
replace: "<duration>"
|
||||||
|
type: "float"
|
||||||
|
|
||||||
|
- name: "throughput"
|
||||||
|
pattern: 'throughput=\s*(\d+(?:\.\d+)?)'
|
||||||
|
replace: "<throughput>"
|
||||||
|
type: "float"
|
||||||
|
|
||||||
|
- name: "filesize"
|
||||||
|
pattern: '(\d+(?:\.\d+)?)\s*(?:MByte|GByte|MiB|GiB|GB|MB|KB)'
|
||||||
|
replace: "<filesize>"
|
||||||
|
type: "float"
|
||||||
|
|
||||||
|
- name: "hostport"
|
||||||
|
pattern: '([a-zA-Z0-9.-]+:\d+)'
|
||||||
|
replace: "<HOSTPORT>"
|
||||||
|
type: "string"
|
||||||
|
|
||||||
|
- name: ""
|
||||||
|
pattern: '\b(\d+(?:\.\d+)?)\b'
|
||||||
|
replace: "<NUM>"
|
||||||
|
type: "float"
|
||||||
|
|
||||||
|
detector:
|
||||||
|
method: "ensemble"
|
||||||
|
ensemble:
|
||||||
|
enabled: true
|
||||||
|
method: "sead"
|
||||||
|
contamination: 0.15
|
||||||
|
sead:
|
||||||
|
eta: 0.1
|
||||||
|
lambda: 0.01
|
||||||
|
auto_scaling:
|
||||||
|
enabled: true
|
||||||
|
high_threshold: 75.0
|
||||||
|
critical_threshold: 90.0
|
||||||
|
high_duration: 90.0
|
||||||
|
critical_duration: 120.0
|
||||||
|
down_threshold: 50.0
|
||||||
|
down_duration: 120.0
|
||||||
|
rrcf_variants:
|
||||||
|
fast:
|
||||||
|
num_trees: 50
|
||||||
|
tree_size: 32
|
||||||
|
threshold_percentile: 0.85
|
||||||
|
mid:
|
||||||
|
num_trees: 150
|
||||||
|
tree_size: 64
|
||||||
|
threshold_percentile: 0.85
|
||||||
|
slow:
|
||||||
|
num_trees: 200
|
||||||
|
tree_size: 128
|
||||||
|
threshold_percentile: 0.85
|
||||||
|
copod:
|
||||||
|
buffer_size: 50
|
||||||
|
threshold: 0.3
|
||||||
|
mad:
|
||||||
|
threshold: 3.5
|
||||||
|
calibration_size: 50
|
||||||
|
|
||||||
|
output:
|
||||||
|
feature_log_path: "logs/features.jsonl"
|
||||||
|
anomaly_log_path: "logs/anomalies.jsonl"
|
||||||
49
go.mod
Normal file
49
go.mod
Normal file
|
|
@ -0,0 +1,49 @@
|
||||||
|
module codeberg.org/pata1704/guenther
|
||||||
|
|
||||||
|
go 1.25.5
|
||||||
|
|
||||||
|
require (
|
||||||
|
codeberg.org/pata1704/copod v0.0.0-20260308082005-aded842ae0c1
|
||||||
|
codeberg.org/pata1704/drain3 v1.0.0
|
||||||
|
codeberg.org/pata1704/rrcf v0.0.0-20260305123746-25e149fa69ba
|
||||||
|
github.com/apache/arrow-go/v18 v18.5.1
|
||||||
|
github.com/duckdb/duckdb-go/v2 v2.5.5
|
||||||
|
github.com/e-XpertSolutions/go-iforest v1.0.0
|
||||||
|
github.com/fsnotify/fsnotify v1.9.0
|
||||||
|
github.com/stretchr/testify v1.11.1
|
||||||
|
gopkg.in/yaml.v3 v3.0.1
|
||||||
|
)
|
||||||
|
|
||||||
|
require (
|
||||||
|
github.com/davecgh/go-spew v1.1.2-0.20180830191138-d8f796af33cc // indirect
|
||||||
|
github.com/duckdb/duckdb-go-bindings v0.3.3 // indirect
|
||||||
|
github.com/duckdb/duckdb-go-bindings/lib/darwin-amd64 v0.3.3 // indirect
|
||||||
|
github.com/duckdb/duckdb-go-bindings/lib/darwin-arm64 v0.3.3 // indirect
|
||||||
|
github.com/duckdb/duckdb-go-bindings/lib/linux-amd64 v0.3.3 // indirect
|
||||||
|
github.com/duckdb/duckdb-go-bindings/lib/linux-arm64 v0.3.3 // indirect
|
||||||
|
github.com/duckdb/duckdb-go-bindings/lib/windows-amd64 v0.3.3 // indirect
|
||||||
|
github.com/dustin/go-humanize v1.0.1 // indirect
|
||||||
|
github.com/go-viper/mapstructure/v2 v2.5.0 // indirect
|
||||||
|
github.com/goccy/go-json v0.10.5 // indirect
|
||||||
|
github.com/google/flatbuffers v25.12.19+incompatible // indirect
|
||||||
|
github.com/google/uuid v1.6.0 // indirect
|
||||||
|
github.com/klauspost/compress v1.18.3 // indirect
|
||||||
|
github.com/klauspost/cpuid/v2 v2.3.0 // indirect
|
||||||
|
github.com/mattn/go-isatty v0.0.20 // indirect
|
||||||
|
github.com/ncruces/go-strftime v1.0.0 // indirect
|
||||||
|
github.com/pierrec/lz4/v4 v4.1.25 // indirect
|
||||||
|
github.com/pmezard/go-difflib v1.0.1-0.20181226105442-5d4384ee4fb2 // indirect
|
||||||
|
github.com/remyoudompheng/bigfft v0.0.0-20230129092748-24d4a6f8daec // indirect
|
||||||
|
github.com/zeebo/xxh3 v1.1.0 // indirect
|
||||||
|
golang.org/x/exp v0.0.0-20260112195511-716be5621a96 // indirect
|
||||||
|
golang.org/x/mod v0.32.0 // indirect
|
||||||
|
golang.org/x/sync v0.19.0 // indirect
|
||||||
|
golang.org/x/sys v0.40.0 // indirect
|
||||||
|
golang.org/x/telemetry v0.0.0-20260116145544-c6413dc483f5 // indirect
|
||||||
|
golang.org/x/tools v0.41.0 // indirect
|
||||||
|
golang.org/x/xerrors v0.0.0-20240903120638-7835f813f4da // indirect
|
||||||
|
modernc.org/libc v1.67.6 // indirect
|
||||||
|
modernc.org/mathutil v1.7.1 // indirect
|
||||||
|
modernc.org/memory v1.11.0 // indirect
|
||||||
|
modernc.org/sqlite v1.44.1 // indirect
|
||||||
|
)
|
||||||
125
go.sum
Normal file
125
go.sum
Normal file
|
|
@ -0,0 +1,125 @@
|
||||||
|
codeberg.org/pata1704/copod v0.0.0-20260308082005-aded842ae0c1 h1:DoXV7m58nWibyIvVaUj4AVyVM/FN1SSpHuiuae+2Pa0=
|
||||||
|
codeberg.org/pata1704/copod v0.0.0-20260308082005-aded842ae0c1/go.mod h1:IchgVmiksba/DP7BjHiAYKoSrKTe3zrNrFO9QZWNxx0=
|
||||||
|
codeberg.org/pata1704/drain3 v1.0.0 h1:X66fn+lnzOMU+PFFSkNBF89z1ghbqihE1I4A6x/OJIM=
|
||||||
|
codeberg.org/pata1704/drain3 v1.0.0/go.mod h1:+K1hIYh3hNSPiXRxUin6ZiC2CC9FDGqQKNNR+7ZIx9s=
|
||||||
|
codeberg.org/pata1704/rrcf v0.0.0-20260305123746-25e149fa69ba h1:szOyiRopNELsHg9v/Tvif2292MGpgz+Hw9QqTMgildg=
|
||||||
|
codeberg.org/pata1704/rrcf v0.0.0-20260305123746-25e149fa69ba/go.mod h1:BmI1vkwcwL5tlRVfn3wEDZV+MXQbPMj8w7IsUhelrkA=
|
||||||
|
github.com/andybalholm/brotli v1.2.0 h1:ukwgCxwYrmACq68yiUqwIWnGY0cTPox/M94sVwToPjQ=
|
||||||
|
github.com/andybalholm/brotli v1.2.0/go.mod h1:rzTDkvFWvIrjDXZHkuS16NPggd91W3kUSvPlQ1pLaKY=
|
||||||
|
github.com/apache/arrow-go/v18 v18.5.1 h1:yaQ6zxMGgf9YCYw4/oaeOU3AULySDlAYDOcnr4LdHdI=
|
||||||
|
github.com/apache/arrow-go/v18 v18.5.1/go.mod h1:OCCJsmdq8AsRm8FkBSSmYTwL/s4zHW9CqxeBxEytkNE=
|
||||||
|
github.com/apache/thrift v0.22.0 h1:r7mTJdj51TMDe6RtcmNdQxgn9XcyfGDOzegMDRg47uc=
|
||||||
|
github.com/apache/thrift v0.22.0/go.mod h1:1e7J/O1Ae6ZQMTYdy9xa3w9k+XHWPfRvdPyJeynQ+/g=
|
||||||
|
github.com/davecgh/go-spew v1.1.2-0.20180830191138-d8f796af33cc h1:U9qPSI2PIWSS1VwoXQT9A3Wy9MM3WgvqSxFWenqJduM=
|
||||||
|
github.com/davecgh/go-spew v1.1.2-0.20180830191138-d8f796af33cc/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
|
||||||
|
github.com/duckdb/duckdb-go-bindings v0.3.3 h1:lXogtCY8hiGLQvTfK55HcgvaA3K2MrwKeZGqhIin35U=
|
||||||
|
github.com/duckdb/duckdb-go-bindings v0.3.3/go.mod h1:zS7OpBP8zwVlP38OljRZOnqWYlNd4KLcVfMoA1JFzpk=
|
||||||
|
github.com/duckdb/duckdb-go-bindings/lib/darwin-amd64 v0.3.3 h1:ue8BtIOSt+2Bt2fEfTAvBcQLxzBFhgfCcyzPtqQWTRA=
|
||||||
|
github.com/duckdb/duckdb-go-bindings/lib/darwin-amd64 v0.3.3/go.mod h1:EnAvZh1kNJHp5yF+M1ZHNEvapnmt6anq1xXHVrAGqMo=
|
||||||
|
github.com/duckdb/duckdb-go-bindings/lib/darwin-arm64 v0.3.3 h1:2TrSeTgtwi3WIvub9ba0mny+AClSNo1w0Ghszc2B8lQ=
|
||||||
|
github.com/duckdb/duckdb-go-bindings/lib/darwin-arm64 v0.3.3/go.mod h1:IGLSeEcFhNeZF16aVjQCULD7TsFZKG5G7SyKJAXKp5c=
|
||||||
|
github.com/duckdb/duckdb-go-bindings/lib/linux-amd64 v0.3.3 h1:GN0cexhfE7uLb7qgDmsYG324wKF15nW+O7v5+NGalS4=
|
||||||
|
github.com/duckdb/duckdb-go-bindings/lib/linux-amd64 v0.3.3/go.mod h1:KAIynZ0GHCS7X5fRyuFnQMg/SZBPK/bS9OCOVojClxw=
|
||||||
|
github.com/duckdb/duckdb-go-bindings/lib/linux-arm64 v0.3.3 h1:bIJV+ct6yvMXjy+N3bfILFd0fkTK50AUhUTerkY40/8=
|
||||||
|
github.com/duckdb/duckdb-go-bindings/lib/linux-arm64 v0.3.3/go.mod h1:81SGOYoEUs8qaAfSk1wRfM5oobrIJ5KI7AzYhK6/bvQ=
|
||||||
|
github.com/duckdb/duckdb-go-bindings/lib/windows-amd64 v0.3.3 h1:SK2sunA/MPb2T3113iFzHv6DWeu+qrsw0DizTFrvM+Q=
|
||||||
|
github.com/duckdb/duckdb-go-bindings/lib/windows-amd64 v0.3.3/go.mod h1:K25pJL26ARblGDeuAkrdblFvUen92+CwksLtPEHRqqQ=
|
||||||
|
github.com/duckdb/duckdb-go/v2 v2.5.5 h1:TlK8ipnzoKW2aNrjGqRkFWLCDpJDxR/VwH8ezEcvVhw=
|
||||||
|
github.com/duckdb/duckdb-go/v2 v2.5.5/go.mod h1:6uIbC3gz36NCEygECzboygOo/Z9TeVwox/puG+ohWV0=
|
||||||
|
github.com/dustin/go-humanize v1.0.1 h1:GzkhY7T5VNhEkwH0PVJgjz+fX1rhBrR7pRT3mDkpeCY=
|
||||||
|
github.com/dustin/go-humanize v1.0.1/go.mod h1:Mu1zIs6XwVuF/gI1OepvI0qD18qycQx+mFykh5fBlto=
|
||||||
|
github.com/e-XpertSolutions/go-iforest v1.0.0 h1:x8IN5xsmugc9VsVyHlBtR7EY9tEacBX7A5dwXXh1y94=
|
||||||
|
github.com/e-XpertSolutions/go-iforest v1.0.0/go.mod h1:t3C4RgLJcVtm2sOOXB+UTbwGiT+TPQAeP9daEWR4C8c=
|
||||||
|
github.com/fsnotify/fsnotify v1.9.0 h1:2Ml+OJNzbYCTzsxtv8vKSFD9PbJjmhYF14k/jKC7S9k=
|
||||||
|
github.com/fsnotify/fsnotify v1.9.0/go.mod h1:8jBTzvmWwFyi3Pb8djgCCO5IBqzKJ/Jwo8TRcHyHii0=
|
||||||
|
github.com/go-viper/mapstructure/v2 v2.5.0 h1:vM5IJoUAy3d7zRSVtIwQgBj7BiWtMPfmPEgAXnvj1Ro=
|
||||||
|
github.com/go-viper/mapstructure/v2 v2.5.0/go.mod h1:oJDH3BJKyqBA2TXFhDsKDGDTlndYOZ6rGS0BRZIxGhM=
|
||||||
|
github.com/goccy/go-json v0.10.5 h1:Fq85nIqj+gXn/S5ahsiTlK3TmC85qgirsdTP/+DeaC4=
|
||||||
|
github.com/goccy/go-json v0.10.5/go.mod h1:oq7eo15ShAhp70Anwd5lgX2pLfOS3QCiwU/PULtXL6M=
|
||||||
|
github.com/golang/snappy v1.0.0 h1:Oy607GVXHs7RtbggtPBnr2RmDArIsAefDwvrdWvRhGs=
|
||||||
|
github.com/golang/snappy v1.0.0/go.mod h1:/XxbfmMg8lxefKM7IXC3fBNl/7bRcc72aCRzEWrmP2Q=
|
||||||
|
github.com/google/flatbuffers v25.12.19+incompatible h1:haMV2JRRJCe1998HeW/p0X9UaMTK6SDo0ffLn2+DbLs=
|
||||||
|
github.com/google/flatbuffers v25.12.19+incompatible/go.mod h1:1AeVuKshWv4vARoZatz6mlQ0JxURH0Kv5+zNeJKJCa8=
|
||||||
|
github.com/google/go-cmp v0.6.0 h1:ofyhxvXcZhMsU5ulbFiLKl/XBFqE1GSq7atu8tAmTRI=
|
||||||
|
github.com/google/go-cmp v0.6.0/go.mod h1:17dUlkBOakJ0+DkrSSNjCkIjxS6bF9zb3elmeNGIjoY=
|
||||||
|
github.com/google/pprof v0.0.0-20250317173921-a4b03ec1a45e h1:ijClszYn+mADRFY17kjQEVQ1XRhq2/JR1M3sGqeJoxs=
|
||||||
|
github.com/google/pprof v0.0.0-20250317173921-a4b03ec1a45e/go.mod h1:boTsfXsheKC2y+lKOCMpSfarhxDeIzfZG1jqGcPl3cA=
|
||||||
|
github.com/google/uuid v1.6.0 h1:NIvaJDMOsjHA8n1jAhLSgzrAzy1Hgr+hNrb57e+94F0=
|
||||||
|
github.com/google/uuid v1.6.0/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo=
|
||||||
|
github.com/hashicorp/golang-lru/v2 v2.0.7 h1:a+bsQ5rvGLjzHuww6tVxozPZFVghXaHOwFs4luLUK2k=
|
||||||
|
github.com/hashicorp/golang-lru/v2 v2.0.7/go.mod h1:QeFd9opnmA6QUJc5vARoKUSoFhyfM2/ZepoAG6RGpeM=
|
||||||
|
github.com/klauspost/asmfmt v1.3.2 h1:4Ri7ox3EwapiOjCki+hw14RyKk201CN4rzyCJRFLpK4=
|
||||||
|
github.com/klauspost/asmfmt v1.3.2/go.mod h1:AG8TuvYojzulgDAMCnYn50l/5QV3Bs/tp6j0HLHbNSE=
|
||||||
|
github.com/klauspost/compress v1.18.3 h1:9PJRvfbmTabkOX8moIpXPbMMbYN60bWImDDU7L+/6zw=
|
||||||
|
github.com/klauspost/compress v1.18.3/go.mod h1:R0h/fSBs8DE4ENlcrlib3PsXS61voFxhIs2DeRhCvJ4=
|
||||||
|
github.com/klauspost/cpuid/v2 v2.3.0 h1:S4CRMLnYUhGeDFDqkGriYKdfoFlDnMtqTiI/sFzhA9Y=
|
||||||
|
github.com/klauspost/cpuid/v2 v2.3.0/go.mod h1:hqwkgyIinND0mEev00jJYCxPNVRVXFQeu1XKlok6oO0=
|
||||||
|
github.com/mattn/go-isatty v0.0.20 h1:xfD0iDuEKnDkl03q4limB+vH+GxLEtL/jb4xVJSWWEY=
|
||||||
|
github.com/mattn/go-isatty v0.0.20/go.mod h1:W+V8PltTTMOvKvAeJH7IuucS94S2C6jfK/D7dTCTo3Y=
|
||||||
|
github.com/minio/asm2plan9s v0.0.0-20200509001527-cdd76441f9d8 h1:AMFGa4R4MiIpspGNG7Z948v4n35fFGB3RR3G/ry4FWs=
|
||||||
|
github.com/minio/asm2plan9s v0.0.0-20200509001527-cdd76441f9d8/go.mod h1:mC1jAcsrzbxHt8iiaC+zU4b1ylILSosueou12R++wfY=
|
||||||
|
github.com/minio/c2goasm v0.0.0-20190812172519-36a3d3bbc4f3 h1:+n/aFZefKZp7spd8DFdX7uMikMLXX4oubIzJF4kv/wI=
|
||||||
|
github.com/minio/c2goasm v0.0.0-20190812172519-36a3d3bbc4f3/go.mod h1:RagcQ7I8IeTMnF8JTXieKnO4Z6JCsikNEzj0DwauVzE=
|
||||||
|
github.com/ncruces/go-strftime v1.0.0 h1:HMFp8mLCTPp341M/ZnA4qaf7ZlsbTc+miZjCLOFAw7w=
|
||||||
|
github.com/ncruces/go-strftime v1.0.0/go.mod h1:Fwc5htZGVVkseilnfgOVb9mKy6w1naJmn9CehxcKcls=
|
||||||
|
github.com/pierrec/lz4/v4 v4.1.25 h1:kocOqRffaIbU5djlIBr7Wh+cx82C0vtFb0fOurZHqD0=
|
||||||
|
github.com/pierrec/lz4/v4 v4.1.25/go.mod h1:EoQMVJgeeEOMsCqCzqFm2O0cJvljX2nGZjcRIPL34O4=
|
||||||
|
github.com/pmezard/go-difflib v1.0.1-0.20181226105442-5d4384ee4fb2 h1:Jamvg5psRIccs7FGNTlIRMkT8wgtp5eCXdBlqhYGL6U=
|
||||||
|
github.com/pmezard/go-difflib v1.0.1-0.20181226105442-5d4384ee4fb2/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4=
|
||||||
|
github.com/remyoudompheng/bigfft v0.0.0-20230129092748-24d4a6f8daec h1:W09IVJc94icq4NjY3clb7Lk8O1qJ8BdBEF8z0ibU0rE=
|
||||||
|
github.com/remyoudompheng/bigfft v0.0.0-20230129092748-24d4a6f8daec/go.mod h1:qqbHyh8v60DhA7CoWK5oRCqLrMHRGoxYCSS9EjAz6Eo=
|
||||||
|
github.com/stretchr/testify v1.11.1 h1:7s2iGBzp5EwR7/aIZr8ao5+dra3wiQyKjjFuvgVKu7U=
|
||||||
|
github.com/stretchr/testify v1.11.1/go.mod h1:wZwfW3scLgRK+23gO65QZefKpKQRnfz6sD981Nm4B6U=
|
||||||
|
github.com/zeebo/assert v1.3.0 h1:g7C04CbJuIDKNPFHmsk4hwZDO5O+kntRxzaUoNXj+IQ=
|
||||||
|
github.com/zeebo/assert v1.3.0/go.mod h1:Pq9JiuJQpG8JLJdtkwrJESF0Foym2/D9XMU5ciN/wJ0=
|
||||||
|
github.com/zeebo/xxh3 v1.1.0 h1:s7DLGDK45Dyfg7++yxI0khrfwq9661w9EN78eP/UZVs=
|
||||||
|
github.com/zeebo/xxh3 v1.1.0/go.mod h1:IisAie1LELR4xhVinxWS5+zf1lA4p0MW4T+w+W07F5s=
|
||||||
|
golang.org/x/exp v0.0.0-20260112195511-716be5621a96 h1:Z/6YuSHTLOHfNFdb8zVZomZr7cqNgTJvA8+Qz75D8gU=
|
||||||
|
golang.org/x/exp v0.0.0-20260112195511-716be5621a96/go.mod h1:nzimsREAkjBCIEFtHiYkrJyT+2uy9YZJB7H1k68CXZU=
|
||||||
|
golang.org/x/mod v0.32.0 h1:9F4d3PHLljb6x//jOyokMv3eX+YDeepZSEo3mFJy93c=
|
||||||
|
golang.org/x/mod v0.32.0/go.mod h1:SgipZ/3h2Ci89DlEtEXWUk/HteuRin+HHhN+WbNhguU=
|
||||||
|
golang.org/x/sync v0.19.0 h1:vV+1eWNmZ5geRlYjzm2adRgW2/mcpevXNg50YZtPCE4=
|
||||||
|
golang.org/x/sync v0.19.0/go.mod h1:9KTHXmSnoGruLpwFjVSX0lNNA75CykiMECbovNTZqGI=
|
||||||
|
golang.org/x/sys v0.6.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
|
||||||
|
golang.org/x/sys v0.40.0 h1:DBZZqJ2Rkml6QMQsZywtnjnnGvHza6BTfYFWY9kjEWQ=
|
||||||
|
golang.org/x/sys v0.40.0/go.mod h1:OgkHotnGiDImocRcuBABYBEXf8A9a87e/uXjp9XT3ks=
|
||||||
|
golang.org/x/telemetry v0.0.0-20260116145544-c6413dc483f5 h1:i0p03B68+xC1kD2QUO8JzDTPXCzhN56OLJ+IhHY8U3A=
|
||||||
|
golang.org/x/telemetry v0.0.0-20260116145544-c6413dc483f5/go.mod h1:b7fPSJ0pKZ3ccUh8gnTONJxhn3c/PS6tyzQvyqw4iA8=
|
||||||
|
golang.org/x/tools v0.41.0 h1:a9b8iMweWG+S0OBnlU36rzLp20z1Rp10w+IY2czHTQc=
|
||||||
|
golang.org/x/tools v0.41.0/go.mod h1:XSY6eDqxVNiYgezAVqqCeihT4j1U2CCsqvH3WhQpnlg=
|
||||||
|
golang.org/x/xerrors v0.0.0-20240903120638-7835f813f4da h1:noIWHXmPHxILtqtCOPIhSt0ABwskkZKjD3bXGnZGpNY=
|
||||||
|
golang.org/x/xerrors v0.0.0-20240903120638-7835f813f4da/go.mod h1:NDW/Ps6MPRej6fsCIbMTohpP40sJ/P/vI1MoTEGwX90=
|
||||||
|
gonum.org/v1/gonum v0.16.0 h1:5+ul4Swaf3ESvrOnidPp4GZbzf0mxVQpDCYUQE7OJfk=
|
||||||
|
gonum.org/v1/gonum v0.16.0/go.mod h1:fef3am4MQ93R2HHpKnLk4/Tbh/s0+wqD5nfa6Pnwy4E=
|
||||||
|
gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405 h1:yhCVgyC4o1eVCa2tZl7eS0r+SDo693bJlVdllGtEeKM=
|
||||||
|
gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0=
|
||||||
|
gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA=
|
||||||
|
gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM=
|
||||||
|
modernc.org/cc/v4 v4.27.1 h1:9W30zRlYrefrDV2JE2O8VDtJ1yPGownxciz5rrbQZis=
|
||||||
|
modernc.org/cc/v4 v4.27.1/go.mod h1:uVtb5OGqUKpoLWhqwNQo/8LwvoiEBLvZXIQ/SmO6mL0=
|
||||||
|
modernc.org/ccgo/v4 v4.30.1 h1:4r4U1J6Fhj98NKfSjnPUN7Ze2c6MnAdL0hWw6+LrJpc=
|
||||||
|
modernc.org/ccgo/v4 v4.30.1/go.mod h1:bIOeI1JL54Utlxn+LwrFyjCx2n2RDiYEaJVSrgdrRfM=
|
||||||
|
modernc.org/fileutil v1.3.40 h1:ZGMswMNc9JOCrcrakF1HrvmergNLAmxOPjizirpfqBA=
|
||||||
|
modernc.org/fileutil v1.3.40/go.mod h1:HxmghZSZVAz/LXcMNwZPA/DRrQZEVP9VX0V4LQGQFOc=
|
||||||
|
modernc.org/gc/v2 v2.6.5 h1:nyqdV8q46KvTpZlsw66kWqwXRHdjIlJOhG6kxiV/9xI=
|
||||||
|
modernc.org/gc/v2 v2.6.5/go.mod h1:YgIahr1ypgfe7chRuJi2gD7DBQiKSLMPgBQe9oIiito=
|
||||||
|
modernc.org/gc/v3 v3.1.1 h1:k8T3gkXWY9sEiytKhcgyiZ2L0DTyCQ/nvX+LoCljoRE=
|
||||||
|
modernc.org/gc/v3 v3.1.1/go.mod h1:HFK/6AGESC7Ex+EZJhJ2Gni6cTaYpSMmU/cT9RmlfYY=
|
||||||
|
modernc.org/goabi0 v0.2.0 h1:HvEowk7LxcPd0eq6mVOAEMai46V+i7Jrj13t4AzuNks=
|
||||||
|
modernc.org/goabi0 v0.2.0/go.mod h1:CEFRnnJhKvWT1c1JTI3Avm+tgOWbkOu5oPA8eH8LnMI=
|
||||||
|
modernc.org/libc v1.67.6 h1:eVOQvpModVLKOdT+LvBPjdQqfrZq+pC39BygcT+E7OI=
|
||||||
|
modernc.org/libc v1.67.6/go.mod h1:JAhxUVlolfYDErnwiqaLvUqc8nfb2r6S6slAgZOnaiE=
|
||||||
|
modernc.org/mathutil v1.7.1 h1:GCZVGXdaN8gTqB1Mf/usp1Y/hSqgI2vAGGP4jZMCxOU=
|
||||||
|
modernc.org/mathutil v1.7.1/go.mod h1:4p5IwJITfppl0G4sUEDtCr4DthTaT47/N3aT6MhfgJg=
|
||||||
|
modernc.org/memory v1.11.0 h1:o4QC8aMQzmcwCK3t3Ux/ZHmwFPzE6hf2Y5LbkRs+hbI=
|
||||||
|
modernc.org/memory v1.11.0/go.mod h1:/JP4VbVC+K5sU2wZi9bHoq2MAkCnrt2r98UGeSK7Mjw=
|
||||||
|
modernc.org/opt v0.1.4 h1:2kNGMRiUjrp4LcaPuLY2PzUfqM/w9N23quVwhKt5Qm8=
|
||||||
|
modernc.org/opt v0.1.4/go.mod h1:03fq9lsNfvkYSfxrfUhZCWPk1lm4cq4N+Bh//bEtgns=
|
||||||
|
modernc.org/sortutil v1.2.1 h1:+xyoGf15mM3NMlPDnFqrteY07klSFxLElE2PVuWIJ7w=
|
||||||
|
modernc.org/sortutil v1.2.1/go.mod h1:7ZI3a3REbai7gzCLcotuw9AC4VZVpYMjDzETGsSMqJE=
|
||||||
|
modernc.org/sqlite v1.44.1 h1:qybx/rNpfQipX/t47OxbHmkkJuv2JWifCMH8SVUiDas=
|
||||||
|
modernc.org/sqlite v1.44.1/go.mod h1:CzbrU2lSB1DKUusvwGz7rqEKIq+NUd8GWuBBZDs9/nA=
|
||||||
|
modernc.org/strutil v1.2.1 h1:UneZBkQA+DX2Rp35KcM69cSsNES9ly8mQWD71HKlOA0=
|
||||||
|
modernc.org/strutil v1.2.1/go.mod h1:EHkiggD70koQxjVdSBM3JKM7k6L0FbGE5eymy9i3B9A=
|
||||||
|
modernc.org/token v1.1.0 h1:Xl7Ap9dKaEs5kLoOQeQmPWevfnk/DM5qcLcYlA8ys6Y=
|
||||||
|
modernc.org/token v1.1.0/go.mod h1:UGzOrNV1mAFSEB63lOFHIpNRUVMvYTc6yu1SMY/XTDM=
|
||||||
250
internal/collector/log.go
Normal file
250
internal/collector/log.go
Normal file
|
|
@ -0,0 +1,250 @@
|
||||||
|
package collector
|
||||||
|
|
||||||
|
import (
|
||||||
|
"bufio"
|
||||||
|
"context"
|
||||||
|
"fmt"
|
||||||
|
"io"
|
||||||
|
"log"
|
||||||
|
"os"
|
||||||
|
"strings"
|
||||||
|
"sync"
|
||||||
|
"sync/atomic"
|
||||||
|
"time"
|
||||||
|
|
||||||
|
drain3go "codeberg.org/pata1704/drain3"
|
||||||
|
"codeberg.org/pata1704/guenther/internal/config"
|
||||||
|
idrain3 "codeberg.org/pata1704/guenther/internal/drain3"
|
||||||
|
"codeberg.org/pata1704/guenther/pkg/types"
|
||||||
|
"github.com/fsnotify/fsnotify"
|
||||||
|
)
|
||||||
|
|
||||||
|
// linePool hands out reusable *strings.Builder values so that line handling
// for high-volume log files can avoid allocating a fresh builder each time.
//
// NOTE(review): nothing in this file calls linePool.Get or linePool.Put –
// confirm whether the pool is still needed.
var linePool = sync.Pool{
	New: func() any {
		return &strings.Builder{}
	},
}
|
||||||
|
|
||||||
|
// LogCollector tails a log file using inotify (fsnotify) and emits a
|
||||||
|
// types.LogEvent for every non-empty line.
|
||||||
|
//
|
||||||
|
// Processing pipeline per line:
|
||||||
|
// 1. ApplyMasking – extracts named parameters and masks the line.
|
||||||
|
// 2. Drain3.Parse – mines a template ID from the masked line.
|
||||||
|
// 3. Severity – classified from the raw line.
|
||||||
|
// 4. Emit – non-blocking channel send with drop counter.
|
||||||
|
//
|
||||||
|
// The collector uses a single goroutine per file and a WaitGroup for clean
|
||||||
|
// shutdown.
|
||||||
|
type LogCollector struct {
|
||||||
|
cfg *config.Config
|
||||||
|
miner *drain3go.TemplateMiner
|
||||||
|
outputChan chan<- types.LogEvent
|
||||||
|
healthChan chan<- types.StageHealth
|
||||||
|
|
||||||
|
wg sync.WaitGroup
|
||||||
|
|
||||||
|
processed atomic.Uint64
|
||||||
|
dropped atomic.Uint64
|
||||||
|
}
|
||||||
|
|
||||||
|
// NewLogCollector creates a LogCollector wired to the provided channels.
|
||||||
|
// Drain3 is initialised with an in-memory persistence store; the template
|
||||||
|
// tree is rebuilt from scratch on restart (state persistence can be added
|
||||||
|
// via FilePersistence if needed).
|
||||||
|
func NewLogCollector(
|
||||||
|
cfg *config.Config,
|
||||||
|
output chan<- types.LogEvent,
|
||||||
|
health chan<- types.StageHealth,
|
||||||
|
) *LogCollector {
|
||||||
|
dc := drain3go.DefaultConfig()
|
||||||
|
dc.SimTh = cfg.Drain.SimThreshold
|
||||||
|
dc.Depth = cfg.Drain.Depth
|
||||||
|
dc.MaxChildren = cfg.Drain.MaxChildren
|
||||||
|
|
||||||
|
miner := drain3go.NewTemplateMiner(dc, drain3go.NewMemoryPersistence())
|
||||||
|
|
||||||
|
return &LogCollector{
|
||||||
|
cfg: cfg,
|
||||||
|
miner: miner,
|
||||||
|
outputChan: output,
|
||||||
|
healthChan: health,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Start begins tailing cfg.Ingestion.LogPath.
|
||||||
|
// The method returns an error if the file cannot be opened or if the
|
||||||
|
// inotify watcher cannot be created. Subsequent errors during tailing are
|
||||||
|
// logged but do not propagate.
|
||||||
|
func (c *LogCollector) Start(ctx context.Context) error {
|
||||||
|
f, err := os.Open(c.cfg.Ingestion.LogPath)
|
||||||
|
if err != nil {
|
||||||
|
return fmt.Errorf("log collector: open %q: %w", c.cfg.Ingestion.LogPath, err)
|
||||||
|
}
|
||||||
|
|
||||||
|
// Seek to end: only tail new content, not existing content.
|
||||||
|
if _, err := f.Seek(0, io.SeekEnd); err != nil {
|
||||||
|
f.Close()
|
||||||
|
return fmt.Errorf("log collector: seek %q: %w", c.cfg.Ingestion.LogPath, err)
|
||||||
|
}
|
||||||
|
|
||||||
|
watcher, err := fsnotify.NewWatcher()
|
||||||
|
if err != nil {
|
||||||
|
f.Close()
|
||||||
|
return fmt.Errorf("log collector: create fsnotify watcher: %w", err)
|
||||||
|
}
|
||||||
|
if err := watcher.Add(c.cfg.Ingestion.LogPath); err != nil {
|
||||||
|
f.Close()
|
||||||
|
watcher.Close()
|
||||||
|
return fmt.Errorf("log collector: watch %q: %w", c.cfg.Ingestion.LogPath, err)
|
||||||
|
}
|
||||||
|
|
||||||
|
reader := bufio.NewReaderSize(f, 64*1024)
|
||||||
|
reportTicker := time.NewTicker(5 * time.Second)
|
||||||
|
|
||||||
|
c.wg.Go(func() {
|
||||||
|
defer f.Close()
|
||||||
|
defer watcher.Close()
|
||||||
|
defer reportTicker.Stop()
|
||||||
|
|
||||||
|
for {
|
||||||
|
select {
|
||||||
|
case event, ok := <-watcher.Events:
|
||||||
|
if !ok {
|
||||||
|
return
|
||||||
|
}
|
||||||
|
if event.Has(fsnotify.Write) {
|
||||||
|
c.drainReader(reader)
|
||||||
|
}
|
||||||
|
if event.Has(fsnotify.Remove) || event.Has(fsnotify.Rename) {
|
||||||
|
// Log rotation: reopen the file.
|
||||||
|
log.Printf("log collector: file %q rotated – reopening", c.cfg.Ingestion.LogPath)
|
||||||
|
f.Close()
|
||||||
|
newF, err := c.reopenFile()
|
||||||
|
if err != nil {
|
||||||
|
log.Printf("log collector: reopen after rotation: %v", err)
|
||||||
|
return
|
||||||
|
}
|
||||||
|
f = newF
|
||||||
|
reader = bufio.NewReaderSize(f, 64*1024)
|
||||||
|
if err := watcher.Add(c.cfg.Ingestion.LogPath); err != nil {
|
||||||
|
log.Printf("log collector: re-watch after rotation: %v", err)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
case err, ok := <-watcher.Errors:
|
||||||
|
if !ok {
|
||||||
|
return
|
||||||
|
}
|
||||||
|
log.Printf("log collector: watcher error: %v", err)
|
||||||
|
|
||||||
|
case <-reportTicker.C:
|
||||||
|
c.emitHealth()
|
||||||
|
|
||||||
|
case <-ctx.Done():
|
||||||
|
return
|
||||||
|
}
|
||||||
|
}
|
||||||
|
})
|
||||||
|
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
|
||||||
|
// Wait waits for the collector goroutine to exit after context cancellation.
|
||||||
|
func (c *LogCollector) Wait() {
|
||||||
|
c.wg.Wait()
|
||||||
|
}
|
||||||
|
|
||||||
|
// drainReader reads all complete lines currently available in reader and
|
||||||
|
// processes each one. Partial lines (no trailing newline) are left in the
|
||||||
|
// bufio buffer for the next Write event.
|
||||||
|
func (c *LogCollector) drainReader(r *bufio.Reader) {
|
||||||
|
for {
|
||||||
|
line, err := r.ReadString('\n')
|
||||||
|
if len(line) > 0 {
|
||||||
|
c.processLine(strings.TrimRight(line, "\r\n"))
|
||||||
|
}
|
||||||
|
if err != nil {
|
||||||
|
// io.EOF means no more complete lines; any other error is logged.
|
||||||
|
if err != io.EOF {
|
||||||
|
log.Printf("log collector: read error: %v", err)
|
||||||
|
}
|
||||||
|
return
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// processLine applies masking, mines a Drain3 template, classifies severity,
|
||||||
|
// and emits a LogEvent. The send is non-blocking; full channels increment the
|
||||||
|
// dropped counter if the pipeline is backlogged.
|
||||||
|
func (c *LogCollector) processLine(line string) {
|
||||||
|
if line == "" {
|
||||||
|
return
|
||||||
|
}
|
||||||
|
|
||||||
|
// Phase 1+2: masking and parameter extraction.
|
||||||
|
masked, params := idrain3.ApplyMasking(line, c.cfg.Drain.MaskingPatterns)
|
||||||
|
|
||||||
|
// Phase 3: template mining on the masked line.
|
||||||
|
result := c.miner.AddLogMessage(masked)
|
||||||
|
if result == nil {
|
||||||
|
return
|
||||||
|
}
|
||||||
|
|
||||||
|
event := types.LogEvent{
|
||||||
|
Timestamp: time.Now(),
|
||||||
|
TemplateID: result.ClusterID,
|
||||||
|
Params: params,
|
||||||
|
Severity: classifySeverity(line),
|
||||||
|
RawLine: line,
|
||||||
|
}
|
||||||
|
|
||||||
|
select {
|
||||||
|
case c.outputChan <- event:
|
||||||
|
c.processed.Add(1)
|
||||||
|
default:
|
||||||
|
c.dropped.Add(1)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// reopenFile opens cfg.Ingestion.LogPath after log rotation, seeking to the
|
||||||
|
// beginning of the new file.
|
||||||
|
func (c *LogCollector) reopenFile() (*os.File, error) {
|
||||||
|
f, err := os.Open(c.cfg.Ingestion.LogPath)
|
||||||
|
if err != nil {
|
||||||
|
return nil, fmt.Errorf("open: %w", err)
|
||||||
|
}
|
||||||
|
return f, nil
|
||||||
|
}
|
||||||
|
|
||||||
|
// emitHealth sends a StageHealth snapshot; non-blocking (drops if full).
|
||||||
|
func (c *LogCollector) emitHealth() {
|
||||||
|
p := c.processed.Load()
|
||||||
|
d := c.dropped.Load()
|
||||||
|
select {
|
||||||
|
case c.healthChan <- types.StageHealth{
|
||||||
|
StageName: "log_collector",
|
||||||
|
EventsProcessed: p,
|
||||||
|
EventsDropped: d,
|
||||||
|
Throughput: float64(p) / 5.0,
|
||||||
|
LastUpdate: time.Now(),
|
||||||
|
}:
|
||||||
|
default:
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// classifySeverity derives a severity label from a raw log line by looking for
// well-known keywords, case-insensitively.
//
// Precedence, highest first:
//   - "ERROR": the line contains "ERROR", "FATAL" or "CRITICAL" anywhere, or
//     contains "ERR" as a standalone word.
//   - "WARN":  the line contains "WARN" (which also covers "WARNING").
//   - "DEBUG": the line contains "DEBUG".
//   - "INFO":  everything else, including the empty string.
//
// "ERR" is matched as a whole word only. As a plain substring it also occurs
// in ordinary words such as "transferred", "interrupt" or "override", which
// would wrongly promote harmless lines to ERROR.
func classifySeverity(line string) string {
	upper := strings.ToUpper(line)

	// hasWord reports whether word appears in upper delimited by anything that
	// is not an ASCII upper-case letter (spaces, brackets, digits, '=', ...).
	hasWord := func(word string) bool {
		tokens := strings.FieldsFunc(upper, func(r rune) bool {
			return r < 'A' || r > 'Z'
		})
		for _, tok := range tokens {
			if tok == word {
				return true
			}
		}
		return false
	}

	switch {
	case strings.Contains(upper, "ERROR"),
		strings.Contains(upper, "FATAL"),
		strings.Contains(upper, "CRITICAL"),
		hasWord("ERR"):
		return "ERROR"
	case strings.Contains(upper, "WARN"):
		return "WARN"
	case strings.Contains(upper, "DEBUG"):
		return "DEBUG"
	default:
		return "INFO"
	}
}
|
||||||
45
internal/collector/log_test.go
Normal file
45
internal/collector/log_test.go
Normal file
|
|
@ -0,0 +1,45 @@
|
||||||
|
package collector
|
||||||
|
|
||||||
|
import (
|
||||||
|
"os"
|
||||||
|
"testing"
|
||||||
|
"time"
|
||||||
|
|
||||||
|
"codeberg.org/pata1704/guenther/internal/config"
|
||||||
|
"codeberg.org/pata1704/guenther/pkg/types"
|
||||||
|
"github.com/stretchr/testify/assert"
|
||||||
|
)
|
||||||
|
|
||||||
|
func TestLogCollector_ProcessLine(t *testing.T) {
|
||||||
|
// 1. Create temporary log file
|
||||||
|
tmpFile, err := os.CreateTemp("", "test_log_*.log")
|
||||||
|
assert.NoError(t, err)
|
||||||
|
defer os.Remove(tmpFile.Name())
|
||||||
|
|
||||||
|
outputChan := make(chan types.LogEvent, 10)
|
||||||
|
healthChan := make(chan types.StageHealth, 10)
|
||||||
|
|
||||||
|
cfg := &config.Config{}
|
||||||
|
cfg.Ingestion.LogPath = tmpFile.Name()
|
||||||
|
cfg.Drain.Depth = 4
|
||||||
|
cfg.Drain.SimThreshold = 0.5
|
||||||
|
cfg.Drain.MaxChildren = 100
|
||||||
|
collector := NewLogCollector(cfg, outputChan, healthChan)
|
||||||
|
|
||||||
|
// 2. Test line processing with specific regex patterns
|
||||||
|
testLine := "2026-02-26 13:00:00.123456 INFO Transfer from 192.168.1.1:8080 completed (duration=1.23)"
|
||||||
|
collector.processLine(testLine)
|
||||||
|
|
||||||
|
select {
|
||||||
|
case ev := <-outputChan:
|
||||||
|
assert.Equal(t, "INFO", ev.Severity)
|
||||||
|
assert.Greater(t, ev.TemplateID, 0)
|
||||||
|
|
||||||
|
t.Logf("Extracted parameters: %v", ev.Params)
|
||||||
|
|
||||||
|
// Unconfigured Drain3 template yields empty map
|
||||||
|
assert.GreaterOrEqual(t, len(ev.Params), 0)
|
||||||
|
case <-time.After(1 * time.Second):
|
||||||
|
t.Fatal("Timeout waiting for LogEvent")
|
||||||
|
}
|
||||||
|
}
|
||||||
542
internal/collector/metric.go
Normal file
542
internal/collector/metric.go
Normal file
|
|
@ -0,0 +1,542 @@
|
||||||
|
package collector
|
||||||
|
|
||||||
|
import (
|
||||||
|
"bufio"
|
||||||
|
"context"
|
||||||
|
"log"
|
||||||
|
"os"
|
||||||
|
"strconv"
|
||||||
|
"strings"
|
||||||
|
"sync"
|
||||||
|
"sync/atomic"
|
||||||
|
"time"
|
||||||
|
|
||||||
|
"codeberg.org/pata1704/guenther/pkg/types"
|
||||||
|
)
|
||||||
|
|
||||||
|
// MetricCollector samples Linux system metrics from /proc at a fixed interval
|
||||||
|
// and emits a types.MetricSnapshot for each sample.
|
||||||
|
//
|
||||||
|
// All /proc reads happen in the single collector goroutine, so no locking is
|
||||||
|
// required for the delta-state fields. The output channel uses a non-blocking
|
||||||
|
// send; overflows are counted in the dropped counter via load-shedding.
|
||||||
|
type MetricCollector struct {
|
||||||
|
outputChan chan<- types.MetricSnapshot
|
||||||
|
healthChan chan<- types.StageHealth
|
||||||
|
|
||||||
|
interval time.Duration
|
||||||
|
netInterface string
|
||||||
|
diskDevice string
|
||||||
|
|
||||||
|
wg sync.WaitGroup
|
||||||
|
|
||||||
|
// Delta state – only accessed from the single collector goroutine.
|
||||||
|
prevSoftnetDropped uint64
|
||||||
|
prevSoftnetSqueeze uint64
|
||||||
|
prevNetPacketsIn uint64
|
||||||
|
prevNetPacketsOut uint64
|
||||||
|
prevDiskReadsComp uint64
|
||||||
|
prevDiskWritesComp uint64
|
||||||
|
prevDiskRead uint64
|
||||||
|
prevDiskWrite uint64
|
||||||
|
prevDiskReadTimeMs uint64
|
||||||
|
prevDiskWriteTimeMs uint64
|
||||||
|
prevDiskIOTicks uint64
|
||||||
|
prevCPUTotal uint64
|
||||||
|
prevCPUIdle uint64
|
||||||
|
prevCPUIoWait uint64
|
||||||
|
prevCPUSoftIrq uint64
|
||||||
|
prevCtxt uint64
|
||||||
|
prevIntr uint64
|
||||||
|
prevNetIn uint64
|
||||||
|
prevNetOut uint64
|
||||||
|
prevNetErrs uint64
|
||||||
|
prevNetDrops uint64
|
||||||
|
prevTCPRetrans uint64
|
||||||
|
prevTCPTimeouts uint64
|
||||||
|
prevTCPLostRetrans uint64
|
||||||
|
prevTCPFastRetrans uint64
|
||||||
|
prevTime time.Time
|
||||||
|
firstSample bool
|
||||||
|
|
||||||
|
processed atomic.Uint64
|
||||||
|
dropped atomic.Uint64
|
||||||
|
}
|
||||||
|
|
||||||
|
func NewMetricCollector(
|
||||||
|
output chan<- types.MetricSnapshot,
|
||||||
|
health chan<- types.StageHealth,
|
||||||
|
interval time.Duration,
|
||||||
|
netIntf, diskDev string,
|
||||||
|
) *MetricCollector {
|
||||||
|
return &MetricCollector{
|
||||||
|
outputChan: output,
|
||||||
|
healthChan: health,
|
||||||
|
interval: interval,
|
||||||
|
netInterface: netIntf,
|
||||||
|
diskDevice: diskDev,
|
||||||
|
firstSample: true,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func (c *MetricCollector) Start(ctx context.Context) {
|
||||||
|
ticker := time.NewTicker(c.interval)
|
||||||
|
reportTicker := time.NewTicker(5 * time.Second)
|
||||||
|
c.prevTime = time.Now()
|
||||||
|
|
||||||
|
c.wg.Go(func() {
|
||||||
|
defer ticker.Stop()
|
||||||
|
defer reportTicker.Stop()
|
||||||
|
|
||||||
|
for {
|
||||||
|
select {
|
||||||
|
case <-ticker.C:
|
||||||
|
snap := c.collect()
|
||||||
|
if snap == nil {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
select {
|
||||||
|
case c.outputChan <- *snap:
|
||||||
|
c.processed.Add(1)
|
||||||
|
default:
|
||||||
|
c.dropped.Add(1)
|
||||||
|
}
|
||||||
|
|
||||||
|
case <-reportTicker.C:
|
||||||
|
c.emitHealth()
|
||||||
|
|
||||||
|
case <-ctx.Done():
|
||||||
|
return
|
||||||
|
}
|
||||||
|
}
|
||||||
|
})
|
||||||
|
}
|
||||||
|
|
||||||
|
// Wait waits for the collector goroutine to exit after context cancellation.
|
||||||
|
func (c *MetricCollector) Wait() {
|
||||||
|
c.wg.Wait()
|
||||||
|
}
|
||||||
|
|
||||||
|
// ── collection ────────────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
func (c *MetricCollector) collect() *types.MetricSnapshot {
|
||||||
|
now := time.Now()
|
||||||
|
duration := now.Sub(c.prevTime).Seconds()
|
||||||
|
|
||||||
|
cpuTotal, cpuIdle, cpuIowait, cpuSoftirq, ctxt, intr := c.readSystemStats()
|
||||||
|
memUsed, memCached, memDirty := c.readMemInfo()
|
||||||
|
netIn, netOut, netErrs, netDrops, rxPackets, txPackets := c.readNetDev()
|
||||||
|
retrans := c.readSNMPStats()
|
||||||
|
timeouts, lostRetrans, fastRetrans := c.readNetstat()
|
||||||
|
softDropped, softSqueeze := c.readSoftnetStat()
|
||||||
|
diskRead, diskWrite, diskReadTime, diskWriteTime, diskIOTicks, readsComp, writesComp := c.readDiskStats()
|
||||||
|
|
||||||
|
if c.firstSample {
|
||||||
|
c.storePrev(now,
|
||||||
|
cpuTotal, cpuIdle, cpuIowait, cpuSoftirq, ctxt, intr,
|
||||||
|
netIn, netOut, netErrs, netDrops, rxPackets, txPackets,
|
||||||
|
retrans, timeouts, lostRetrans, fastRetrans,
|
||||||
|
softDropped, softSqueeze,
|
||||||
|
diskRead, diskWrite, diskReadTime, diskWriteTime, diskIOTicks, readsComp, writesComp)
|
||||||
|
c.firstSample = false
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
|
||||||
|
if duration < 1e-6 {
|
||||||
|
duration = 1e-6
|
||||||
|
}
|
||||||
|
|
||||||
|
cpuDelta := saturatingSub(cpuTotal, c.prevCPUTotal)
|
||||||
|
cpuIdleDelta := saturatingSub(cpuIdle, c.prevCPUIdle)
|
||||||
|
cpuPercent, cpuIowaitPercent, cpuSoftirqPercent := 0.0, 0.0, 0.0
|
||||||
|
if cpuDelta > 0 {
|
||||||
|
cpuPercent = float64(cpuDelta-cpuIdleDelta) / float64(cpuDelta) * 100.0
|
||||||
|
cpuIowaitPercent = float64(saturatingSub(cpuIowait, c.prevCPUIoWait)) / float64(cpuDelta) * 100.0
|
||||||
|
cpuSoftirqPercent = float64(saturatingSub(cpuSoftirq, c.prevCPUSoftIrq)) / float64(cpuDelta) * 100.0
|
||||||
|
}
|
||||||
|
|
||||||
|
snap := &types.MetricSnapshot{
|
||||||
|
Timestamp: now,
|
||||||
|
CPUPercent: cpuPercent,
|
||||||
|
CPUIoWaitPercent: cpuIowaitPercent,
|
||||||
|
CPUSoftIrqPercent: cpuSoftirqPercent,
|
||||||
|
ContextSwitchesPerS: float64(saturatingSub(ctxt, c.prevCtxt)) / duration,
|
||||||
|
InterruptsPerS: float64(saturatingSub(intr, c.prevIntr)) / duration,
|
||||||
|
MemoryUsedMB: float64(memUsed),
|
||||||
|
MemoryCachedMB: float64(memCached),
|
||||||
|
MemoryDirtyMB: float64(memDirty),
|
||||||
|
NetworkInMBps: float64(saturatingSub(netIn, c.prevNetIn)) / duration / 1_048_576,
|
||||||
|
NetworkOutMBps: float64(saturatingSub(netOut, c.prevNetOut)) / duration / 1_048_576,
|
||||||
|
NetErrorsPerS: float64(saturatingSub(netErrs, c.prevNetErrs)) / duration,
|
||||||
|
NetDropsPerS: float64(saturatingSub(netDrops, c.prevNetDrops)) / duration,
|
||||||
|
TCPRetransPerS: float64(saturatingSub(retrans, c.prevTCPRetrans)) / duration,
|
||||||
|
TCPTimeoutsPerS: float64(saturatingSub(timeouts, c.prevTCPTimeouts)) / duration,
|
||||||
|
TCPLostRetransmitPerS: float64(saturatingSub(lostRetrans, c.prevTCPLostRetrans)) / duration,
|
||||||
|
TCPFastRetransPerS: float64(saturatingSub(fastRetrans, c.prevTCPFastRetrans)) / duration,
|
||||||
|
SoftnetDroppedPerS: float64(saturatingSub(softDropped, c.prevSoftnetDropped)) / duration,
|
||||||
|
SoftnetTimeSqueezePerS: float64(saturatingSub(softSqueeze, c.prevSoftnetSqueeze)) / duration,
|
||||||
|
DiskReadMBps: float64(saturatingSub(diskRead, c.prevDiskRead)) / duration / 1_048_576,
|
||||||
|
DiskWriteMBps: float64(saturatingSub(diskWrite, c.prevDiskWrite)) / duration / 1_048_576,
|
||||||
|
DiskReadTimeMsPerS: float64(saturatingSub(diskReadTime, c.prevDiskReadTimeMs)) / duration,
|
||||||
|
DiskWriteTimeMsPerS: float64(saturatingSub(diskWriteTime, c.prevDiskWriteTimeMs)) / duration,
|
||||||
|
DiskIOTicksPerS: float64(saturatingSub(diskIOTicks, c.prevDiskIOTicks)) / duration,
|
||||||
|
NetPacketsInPerS: float64(saturatingSub(rxPackets, c.prevNetPacketsIn)) / duration,
|
||||||
|
NetPacketsOutPerS: float64(saturatingSub(txPackets, c.prevNetPacketsOut)) / duration,
|
||||||
|
DiskReadsCompletedPerS: float64(saturatingSub(readsComp, c.prevDiskReadsComp)) / duration,
|
||||||
|
DiskWritesCompletedPerS: float64(saturatingSub(writesComp, c.prevDiskWritesComp)) / duration,
|
||||||
|
}
|
||||||
|
|
||||||
|
c.storePrev(now,
|
||||||
|
cpuTotal, cpuIdle, cpuIowait, cpuSoftirq, ctxt, intr,
|
||||||
|
netIn, netOut, netErrs, netDrops, rxPackets, txPackets,
|
||||||
|
retrans, timeouts, lostRetrans, fastRetrans,
|
||||||
|
softDropped, softSqueeze,
|
||||||
|
diskRead, diskWrite, diskReadTime, diskWriteTime, diskIOTicks, readsComp, writesComp)
|
||||||
|
return snap
|
||||||
|
}
|
||||||
|
|
||||||
|
func (c *MetricCollector) storePrev(
|
||||||
|
now time.Time,
|
||||||
|
cpuTotal, cpuIdle, cpuIowait, cpuSoftirq, ctxt, intr,
|
||||||
|
netIn, netOut, netErrs, netDrops, rxPackets, txPackets,
|
||||||
|
retrans, timeouts, lostRetrans, fastRetrans,
|
||||||
|
softDropped, softSqueeze,
|
||||||
|
diskRead, diskWrite, diskReadTime, diskWriteTime, diskIOTicks, readsComp, writesComp uint64,
|
||||||
|
) {
|
||||||
|
c.prevTime = now
|
||||||
|
c.prevCPUTotal = cpuTotal
|
||||||
|
c.prevCPUIdle = cpuIdle
|
||||||
|
c.prevCPUIoWait = cpuIowait
|
||||||
|
c.prevCPUSoftIrq = cpuSoftirq
|
||||||
|
c.prevCtxt = ctxt
|
||||||
|
c.prevIntr = intr
|
||||||
|
c.prevNetIn = netIn
|
||||||
|
c.prevNetOut = netOut
|
||||||
|
c.prevNetErrs = netErrs
|
||||||
|
c.prevNetDrops = netDrops
|
||||||
|
c.prevTCPRetrans = retrans
|
||||||
|
c.prevTCPTimeouts = timeouts
|
||||||
|
c.prevTCPLostRetrans = lostRetrans
|
||||||
|
c.prevTCPFastRetrans = fastRetrans
|
||||||
|
c.prevSoftnetDropped = softDropped
|
||||||
|
c.prevSoftnetSqueeze = softSqueeze
|
||||||
|
c.prevDiskRead = diskRead
|
||||||
|
c.prevDiskWrite = diskWrite
|
||||||
|
c.prevDiskReadTimeMs = diskReadTime
|
||||||
|
c.prevDiskWriteTimeMs = diskWriteTime
|
||||||
|
c.prevDiskIOTicks = diskIOTicks
|
||||||
|
c.prevNetPacketsIn = rxPackets
|
||||||
|
c.prevNetPacketsOut = txPackets
|
||||||
|
c.prevDiskReadsComp = readsComp
|
||||||
|
c.prevDiskWritesComp = writesComp
|
||||||
|
}
|
||||||
|
|
||||||
|
// ── /proc readers ─────────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
// readSystemStats reads /proc/stat and returns cumulative CPU jiffies
|
||||||
|
// (total, idle, iowait, softirq) plus cumulative context-switches and
|
||||||
|
// interrupt counts.
|
||||||
|
//
|
||||||
|
// /proc/stat CPU column layout:
|
||||||
|
//
|
||||||
|
// col 1=user 2=nice 3=system 4=idle 5=iowait 6=irq 7=softirq
|
||||||
|
func (c *MetricCollector) readSystemStats() (total, idle, iowait, softirq, ctxt, intr uint64) {
|
||||||
|
f, err := os.Open("/proc/stat")
|
||||||
|
if err != nil {
|
||||||
|
log.Printf("metric: open /proc/stat: %v", err)
|
||||||
|
return
|
||||||
|
}
|
||||||
|
defer f.Close()
|
||||||
|
|
||||||
|
scanner := bufio.NewScanner(f)
|
||||||
|
for scanner.Scan() {
|
||||||
|
fields := strings.Fields(scanner.Text())
|
||||||
|
if len(fields) == 0 {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
switch fields[0] {
|
||||||
|
case "cpu":
|
||||||
|
for i := 1; i < len(fields); i++ {
|
||||||
|
v, _ := strconv.ParseUint(fields[i], 10, 64)
|
||||||
|
total += v
|
||||||
|
switch i {
|
||||||
|
case 4:
|
||||||
|
idle = v
|
||||||
|
case 5:
|
||||||
|
iowait = v
|
||||||
|
case 7:
|
||||||
|
softirq = v
|
||||||
|
}
|
||||||
|
}
|
||||||
|
case "ctxt":
|
||||||
|
if len(fields) > 1 {
|
||||||
|
ctxt, _ = strconv.ParseUint(fields[1], 10, 64)
|
||||||
|
}
|
||||||
|
case "intr":
|
||||||
|
if len(fields) > 1 {
|
||||||
|
intr, _ = strconv.ParseUint(fields[1], 10, 64)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if err := scanner.Err(); err != nil {
|
||||||
|
log.Printf("metric: scan /proc/stat: %v", err)
|
||||||
|
}
|
||||||
|
return
|
||||||
|
}
|
||||||
|
|
||||||
|
func (c *MetricCollector) readMemInfo() (used, cached, dirty uint64) {
|
||||||
|
f, err := os.Open("/proc/meminfo")
|
||||||
|
if err != nil {
|
||||||
|
log.Printf("metric: open /proc/meminfo: %v", err)
|
||||||
|
return
|
||||||
|
}
|
||||||
|
defer f.Close()
|
||||||
|
|
||||||
|
var total, available uint64
|
||||||
|
scanner := bufio.NewScanner(f)
|
||||||
|
for scanner.Scan() {
|
||||||
|
fields := strings.Fields(scanner.Text())
|
||||||
|
if len(fields) < 2 {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
val, _ := strconv.ParseUint(fields[1], 10, 64)
|
||||||
|
switch fields[0] {
|
||||||
|
case "MemTotal:":
|
||||||
|
total = val
|
||||||
|
case "MemAvailable:":
|
||||||
|
available = val
|
||||||
|
case "Cached:":
|
||||||
|
cached = val / 1024 // kB → MB
|
||||||
|
case "Dirty:":
|
||||||
|
dirty = val / 1024 // kB → MB
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if err := scanner.Err(); err != nil {
|
||||||
|
log.Printf("metric: scan /proc/meminfo: %v", err)
|
||||||
|
}
|
||||||
|
if total >= available {
|
||||||
|
used = (total - available) / 1024
|
||||||
|
}
|
||||||
|
return
|
||||||
|
}
|
||||||
|
|
||||||
|
// readNetDev reads /proc/net/dev for the configured interface.
|
||||||
|
//
|
||||||
|
// /proc/net/dev column layout (after stripping "iface:"):
|
||||||
|
//
|
||||||
|
// 0=rx_bytes 1=rx_packets 2=rx_errs 3=rx_drop
|
||||||
|
// 4=rx_fifo 5=rx_frame 6=rx_compressed 7=rx_multicast
|
||||||
|
// 8=tx_bytes 9=tx_packets 10=tx_errs 11=tx_drop ...
|
||||||
|
// 8=tx_bytes 9=tx_packets 10=tx_errs 11=tx_drop ...
|
||||||
|
func (c *MetricCollector) readNetDev() (rxBytes, txBytes, errs, drops, rxPackets, txPackets uint64) {
|
||||||
|
f, err := os.Open("/proc/net/dev")
|
||||||
|
if err != nil {
|
||||||
|
return 0, 0, 0, 0, 0, 0
|
||||||
|
}
|
||||||
|
defer f.Close()
|
||||||
|
|
||||||
|
prefix := c.netInterface + ":"
|
||||||
|
scanner := bufio.NewScanner(f)
|
||||||
|
for scanner.Scan() {
|
||||||
|
line := strings.TrimSpace(scanner.Text())
|
||||||
|
if !strings.HasPrefix(line, prefix) {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
line = strings.TrimPrefix(line, prefix)
|
||||||
|
fields := strings.Fields(line)
|
||||||
|
if len(fields) < 12 {
|
||||||
|
log.Printf("metric: unexpected /proc/net/dev format for %q", c.netInterface)
|
||||||
|
return 0, 0, 0, 0, 0, 0
|
||||||
|
}
|
||||||
|
rxBytes, _ = strconv.ParseUint(fields[0], 10, 64)
|
||||||
|
rxPackets, _ = strconv.ParseUint(fields[1], 10, 64)
|
||||||
|
rxErrs, _ := strconv.ParseUint(fields[2], 10, 64)
|
||||||
|
rxDrops, _ := strconv.ParseUint(fields[3], 10, 64)
|
||||||
|
txBytes, _ = strconv.ParseUint(fields[8], 10, 64)
|
||||||
|
txPackets, _ = strconv.ParseUint(fields[9], 10, 64)
|
||||||
|
txErrs, _ := strconv.ParseUint(fields[10], 10, 64)
|
||||||
|
txDrops, _ := strconv.ParseUint(fields[11], 10, 64)
|
||||||
|
return rxBytes, txBytes, rxErrs + txErrs, rxDrops + txDrops, rxPackets, txPackets
|
||||||
|
}
|
||||||
|
if err := scanner.Err(); err != nil {
|
||||||
|
log.Printf("metric: scan /proc/net/dev: %v", err)
|
||||||
|
}
|
||||||
|
return 0, 0, 0, 0, 0, 0
|
||||||
|
}
|
||||||
|
|
||||||
|
// readSNMPStats reads RetransSegs from /proc/net/snmp (Tcp section).
|
||||||
|
//
|
||||||
|
// /proc/net/snmp Tcp header order (kernel-stable):
|
||||||
|
//
|
||||||
|
// RtoAlgorithm RtoMin RtoMax MaxConn ActiveOpens PassiveOpens
|
||||||
|
// AttemptFails EstabResets CurrEstab InSegs OutSegs RetransSegs InErrs OutRsts
|
||||||
|
//
|
||||||
|
// RetransSegs is at index 12 (0-based) in the value row.
|
||||||
|
func (c *MetricCollector) readSNMPStats() uint64 {
|
||||||
|
f, err := os.Open("/proc/net/snmp")
|
||||||
|
if err != nil {
|
||||||
|
return 0
|
||||||
|
}
|
||||||
|
defer f.Close()
|
||||||
|
|
||||||
|
// The file alternates header/value rows for each protocol block.
|
||||||
|
// We need both rows to find RetransSegs by column name.
|
||||||
|
scanner := bufio.NewScanner(f)
|
||||||
|
var tcpHeader []string
|
||||||
|
for scanner.Scan() {
|
||||||
|
line := scanner.Text()
|
||||||
|
if !strings.HasPrefix(line, "Tcp:") {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
fields := strings.Fields(line)
|
||||||
|
if tcpHeader == nil {
|
||||||
|
tcpHeader = fields // first Tcp: line is the header
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
// second Tcp: line is the values
|
||||||
|
for i, h := range tcpHeader {
|
||||||
|
if h == "RetransSegs" && i < len(fields) {
|
||||||
|
v, _ := strconv.ParseUint(fields[i], 10, 64)
|
||||||
|
return v
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if err := scanner.Err(); err != nil {
|
||||||
|
log.Printf("metric: scan /proc/net/snmp: %v", err)
|
||||||
|
}
|
||||||
|
return 0
|
||||||
|
}
|
||||||
|
|
||||||
|
// readNetstat reads TCPTimeouts, TCPLostRetransmit and TCPFastRetrans from
|
||||||
|
// /proc/net/netstat (TcpExt section). The file alternates header/value rows.
|
||||||
|
func (c *MetricCollector) readNetstat() (timeouts, lostRetrans, fastRetrans uint64) {
|
||||||
|
f, err := os.Open("/proc/net/netstat")
|
||||||
|
if err != nil {
|
||||||
|
return 0, 0, 0
|
||||||
|
}
|
||||||
|
defer f.Close()
|
||||||
|
|
||||||
|
scanner := bufio.NewScanner(f)
|
||||||
|
var headers []string
|
||||||
|
for scanner.Scan() {
|
||||||
|
line := scanner.Text()
|
||||||
|
if !strings.HasPrefix(line, "TcpExt:") {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
fields := strings.Fields(line)
|
||||||
|
if headers == nil {
|
||||||
|
headers = fields
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
// value row
|
||||||
|
for i, h := range headers {
|
||||||
|
if i >= len(fields) {
|
||||||
|
break
|
||||||
|
}
|
||||||
|
switch h {
|
||||||
|
case "TCPTimeouts":
|
||||||
|
timeouts, _ = strconv.ParseUint(fields[i], 10, 64)
|
||||||
|
case "TCPLostRetransmit":
|
||||||
|
lostRetrans, _ = strconv.ParseUint(fields[i], 10, 64)
|
||||||
|
case "TCPFastRetrans":
|
||||||
|
fastRetrans, _ = strconv.ParseUint(fields[i], 10, 64)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if err := scanner.Err(); err != nil {
|
||||||
|
log.Printf("metric: scan /proc/net/netstat: %v", err)
|
||||||
|
}
|
||||||
|
return
|
||||||
|
}
|
||||||
|
|
||||||
|
// readSoftnetStat reads /proc/net/softnet_stat and sums dropped and
|
||||||
|
// time_squeeze across all CPU columns (hex values).
|
||||||
|
func (c *MetricCollector) readSoftnetStat() (dropped, timeSqueeze uint64) {
|
||||||
|
f, err := os.Open("/proc/net/softnet_stat")
|
||||||
|
if err != nil {
|
||||||
|
return 0, 0
|
||||||
|
}
|
||||||
|
defer f.Close()
|
||||||
|
|
||||||
|
scanner := bufio.NewScanner(f)
|
||||||
|
for scanner.Scan() {
|
||||||
|
fields := strings.Fields(scanner.Text())
|
||||||
|
// col 0 = total, col 1 = dropped, col 2 = time_squeeze
|
||||||
|
if len(fields) >= 3 {
|
||||||
|
d, _ := strconv.ParseUint(fields[1], 16, 64)
|
||||||
|
t, _ := strconv.ParseUint(fields[2], 16, 64)
|
||||||
|
dropped += d
|
||||||
|
timeSqueeze += t
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if err := scanner.Err(); err != nil {
|
||||||
|
log.Printf("metric: scan /proc/net/softnet_stat: %v", err)
|
||||||
|
}
|
||||||
|
return
|
||||||
|
}
|
||||||
|
|
||||||
|
// readDiskStats reads /proc/diskstats for the configured device.
|
||||||
|
//
|
||||||
|
// /proc/diskstats column layout (kernel ≥ 4.18):
|
||||||
|
//
|
||||||
|
// 0=major 1=minor 2=name
|
||||||
|
// 3=reads_completed 4=reads_merged 5=sectors_read 6=read_time_ms
|
||||||
|
// 7=writes_completed 8=writes_merged 9=sectors_written 10=write_time_ms
|
||||||
|
// 11=io_in_progress 12=io_ticks_ms 13=weighted_io_ticks
|
||||||
|
// 11=io_in_progress 12=io_ticks_ms 13=weighted_io_ticks
|
||||||
|
func (c *MetricCollector) readDiskStats() (readBytes, writeBytes, readTimeMs, writeTimeMs, ioTicks, readsComp, writesComp uint64) {
|
||||||
|
f, err := os.Open("/proc/diskstats")
|
||||||
|
if err != nil {
|
||||||
|
log.Printf("metric: open /proc/diskstats: %v", err)
|
||||||
|
return
|
||||||
|
}
|
||||||
|
defer f.Close()
|
||||||
|
|
||||||
|
scanner := bufio.NewScanner(f)
|
||||||
|
for scanner.Scan() {
|
||||||
|
fields := strings.Fields(scanner.Text())
|
||||||
|
if len(fields) < 14 || fields[2] != c.diskDevice {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
readsComp, _ = strconv.ParseUint(fields[3], 10, 64)
|
||||||
|
writesComp, _ = strconv.ParseUint(fields[7], 10, 64)
|
||||||
|
rSectors, _ := strconv.ParseUint(fields[5], 10, 64)
|
||||||
|
wSectors, _ := strconv.ParseUint(fields[9], 10, 64)
|
||||||
|
rTime, _ := strconv.ParseUint(fields[6], 10, 64)
|
||||||
|
wTime, _ := strconv.ParseUint(fields[10], 10, 64)
|
||||||
|
ticks, _ := strconv.ParseUint(fields[12], 10, 64)
|
||||||
|
return rSectors * 512, wSectors * 512, rTime, wTime, ticks, readsComp, writesComp
|
||||||
|
}
|
||||||
|
if err := scanner.Err(); err != nil {
|
||||||
|
log.Printf("metric: scan /proc/diskstats: %v", err)
|
||||||
|
}
|
||||||
|
return
|
||||||
|
}
|
||||||
|
|
||||||
|
// ── health ────────────────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
func (c *MetricCollector) emitHealth() {
|
||||||
|
p := c.processed.Load()
|
||||||
|
d := c.dropped.Load()
|
||||||
|
select {
|
||||||
|
case c.healthChan <- types.StageHealth{
|
||||||
|
StageName: "metric_collector",
|
||||||
|
EventsProcessed: p,
|
||||||
|
EventsDropped: d,
|
||||||
|
Throughput: float64(p) / 5.0,
|
||||||
|
LastUpdate: time.Now(),
|
||||||
|
}:
|
||||||
|
default:
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// ── helpers ───────────────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
// saturatingSub computes a − b for unsigned counters and yields 0 instead of
// wrapping around when b is larger than a, so that rates derived from the
// difference can never become huge or "negative".
func saturatingSub(a, b uint64) uint64 {
	if b > a {
		return 0
	}
	return a - b
}
|
||||||
140
internal/collector/systemctl.go
Normal file
140
internal/collector/systemctl.go
Normal file
|
|
@ -0,0 +1,140 @@
|
||||||
|
package collector
|
||||||
|
|
||||||
|
import (
|
||||||
|
"bytes"
|
||||||
|
"context"
|
||||||
|
"log"
|
||||||
|
"os/exec"
|
||||||
|
"strings"
|
||||||
|
"sync"
|
||||||
|
"time"
|
||||||
|
|
||||||
|
"codeberg.org/pata1704/guenther/pkg/types"
|
||||||
|
)
|
||||||
|
|
||||||
|
// SystemctlCollector periodically checks the status of systemd services.
|
||||||
|
type SystemctlCollector struct {
|
||||||
|
services []string
|
||||||
|
interval time.Duration
|
||||||
|
outputChan chan<- types.ServiceStatus
|
||||||
|
healthChan chan<- types.StageHealth
|
||||||
|
|
||||||
|
wg sync.WaitGroup
|
||||||
|
|
||||||
|
mu sync.Mutex
|
||||||
|
processed uint64
|
||||||
|
}
|
||||||
|
|
||||||
|
// NewSystemctlCollector creates a new collector for the given services.
|
||||||
|
func NewSystemctlCollector(
|
||||||
|
services []string,
|
||||||
|
interval time.Duration,
|
||||||
|
output chan<- types.ServiceStatus,
|
||||||
|
health chan<- types.StageHealth,
|
||||||
|
) *SystemctlCollector {
|
||||||
|
return &SystemctlCollector{
|
||||||
|
services: services,
|
||||||
|
interval: interval,
|
||||||
|
outputChan: output,
|
||||||
|
healthChan: health,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Start launches the collection loop.
|
||||||
|
func (c *SystemctlCollector) Start(ctx context.Context) {
|
||||||
|
if len(c.services) == 0 {
|
||||||
|
log.Println("systemctl: no services configured for monitoring")
|
||||||
|
return
|
||||||
|
}
|
||||||
|
|
||||||
|
c.wg.Go(func() {
|
||||||
|
ticker := time.NewTicker(c.interval)
|
||||||
|
reportTicker := time.NewTicker(5 * time.Second)
|
||||||
|
defer ticker.Stop()
|
||||||
|
defer reportTicker.Stop()
|
||||||
|
|
||||||
|
// Immediate first collection.
|
||||||
|
c.collect()
|
||||||
|
|
||||||
|
for {
|
||||||
|
select {
|
||||||
|
case <-ctx.Done():
|
||||||
|
return
|
||||||
|
case <-ticker.C:
|
||||||
|
c.collect()
|
||||||
|
case <-reportTicker.C:
|
||||||
|
c.emitHealth()
|
||||||
|
}
|
||||||
|
}
|
||||||
|
})
|
||||||
|
}
|
||||||
|
|
||||||
|
// Wait waits for the collector to stop.
|
||||||
|
func (c *SystemctlCollector) Wait() {
|
||||||
|
c.wg.Wait()
|
||||||
|
}
|
||||||
|
|
||||||
|
func (c *SystemctlCollector) collect() {
|
||||||
|
for _, service := range c.services {
|
||||||
|
status, err := c.getServiceStatus(service)
|
||||||
|
if err != nil {
|
||||||
|
log.Printf("systemctl: error getting status for %s: %v", service, err)
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
|
||||||
|
select {
|
||||||
|
case c.outputChan <- status:
|
||||||
|
c.mu.Lock()
|
||||||
|
c.processed++
|
||||||
|
c.mu.Unlock()
|
||||||
|
default:
|
||||||
|
log.Printf("systemctl: output channel full – dropping status for %s", service)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func (c *SystemctlCollector) getServiceStatus(service string) (types.ServiceStatus, error) {
|
||||||
|
// Use systemctl show to get machine-readable properties.
|
||||||
|
cmd := exec.Command("systemctl", "show", "-p", "ActiveState,SubState", service)
|
||||||
|
var out bytes.Buffer
|
||||||
|
cmd.Stdout = &out
|
||||||
|
if err := cmd.Run(); err != nil {
|
||||||
|
return types.ServiceStatus{}, err
|
||||||
|
}
|
||||||
|
|
||||||
|
lines := strings.Split(strings.TrimSpace(out.String()), "\n")
|
||||||
|
status := types.ServiceStatus{
|
||||||
|
Timestamp: time.Now(),
|
||||||
|
ServiceName: service,
|
||||||
|
}
|
||||||
|
|
||||||
|
for _, line := range lines {
|
||||||
|
parts := strings.SplitN(line, "=", 2)
|
||||||
|
if len(parts) != 2 {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
switch parts[0] {
|
||||||
|
case "ActiveState":
|
||||||
|
status.ActiveState = parts[1]
|
||||||
|
case "SubState":
|
||||||
|
status.SubState = parts[1]
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return status, nil
|
||||||
|
}
|
||||||
|
|
||||||
|
func (c *SystemctlCollector) emitHealth() {
|
||||||
|
c.mu.Lock()
|
||||||
|
count := c.processed
|
||||||
|
c.mu.Unlock()
|
||||||
|
|
||||||
|
select {
|
||||||
|
case c.healthChan <- types.StageHealth{
|
||||||
|
StageName: "systemctl_collector",
|
||||||
|
EventsProcessed: count,
|
||||||
|
LastUpdate: time.Now(),
|
||||||
|
}:
|
||||||
|
default:
|
||||||
|
}
|
||||||
|
}
|
||||||
203
internal/config/config.go
Normal file
203
internal/config/config.go
Normal file
|
|
@ -0,0 +1,203 @@
|
||||||
|
// Package config provides the pipeline configuration loaded from YAML.
|
||||||
|
package config
|
||||||
|
|
||||||
|
import (
|
||||||
|
"fmt"
|
||||||
|
"os"
|
||||||
|
"regexp"
|
||||||
|
"time"
|
||||||
|
|
||||||
|
"gopkg.in/yaml.v3"
|
||||||
|
)
|
||||||
|
|
||||||
|
// MaskingPattern is a single entry in drain.masking_patterns.
|
||||||
|
type MaskingPattern struct {
|
||||||
|
Name string `yaml:"name"`
|
||||||
|
Pattern string `yaml:"pattern"`
|
||||||
|
Replace string `yaml:"replace"`
|
||||||
|
Type string `yaml:"type"`
|
||||||
|
Re *regexp.Regexp
|
||||||
|
}
|
||||||
|
|
||||||
|
// MADConfig defines parameters for the MAD detector.
|
||||||
|
type MADConfig struct {
|
||||||
|
// Threshold is the modified Z-score cutoff for IsAnomaly.
|
||||||
|
// Recommended: 3.0–4.0. Default: 3.5.
|
||||||
|
Threshold float64 `yaml:"threshold"`
|
||||||
|
// CalibrationSize is the number of NormalizedVectors to buffer before
|
||||||
|
// automatic per-feature median/MAD calibration runs.
|
||||||
|
// Default (if 0): 100.
|
||||||
|
CalibrationSize int `yaml:"calibration_size"`
|
||||||
|
}
|
||||||
|
|
||||||
|
// COPODConfig defines the parameters for the Copula-Based Outlier detector.
|
||||||
|
type COPODConfig struct {
|
||||||
|
Threshold float64 `yaml:"threshold"`
|
||||||
|
BufferSize int `yaml:"buffer_size"`
|
||||||
|
}
|
||||||
|
|
||||||
|
// RRCFConfig defines the parameters for the Robust Random Cut Forest detector.
|
||||||
|
// Used for the standalone RRCF detector and the classic AVG/MAX/MEDIAN ensemble paths.
|
||||||
|
type RRCFConfig struct {
|
||||||
|
NumTrees int `yaml:"num_trees"`
|
||||||
|
TreeSize int `yaml:"tree_size"`
|
||||||
|
ThresholdPercentile float64 `yaml:"threshold_percentile"`
|
||||||
|
}
|
||||||
|
|
||||||
|
// RRCFVariantConfig holds parameters for a single named RRCF variant inside
|
||||||
|
// the SEAD multi-horizon ensemble.
|
||||||
|
type RRCFVariantConfig struct {
|
||||||
|
// NumTrees controls score stability: more trees → smoother/conservative.
|
||||||
|
NumTrees int `yaml:"num_trees"`
|
||||||
|
// TreeSize sets the sliding-window capacity per tree.
|
||||||
|
TreeSize int `yaml:"tree_size"`
|
||||||
|
// ThresholdPercentile is the per-model decision threshold (standalone use).
|
||||||
|
ThresholdPercentile float64 `yaml:"threshold_percentile"`
|
||||||
|
}
|
||||||
|
|
||||||
|
// RRCFVariantsConfig groups the three RRCF variants used by the SEAD ensemble.
|
||||||
|
// Each variant captures anomalies at a different time-horizon:
|
||||||
|
// - Fast: short memory, reactive to transient spikes
|
||||||
|
// - Mid: medium memory, balanced sensitivity
|
||||||
|
// - Slow: long memory, detects sustained / slow-drift events
|
||||||
|
type RRCFVariantsConfig struct {
|
||||||
|
Fast RRCFVariantConfig `yaml:"fast"`
|
||||||
|
Mid RRCFVariantConfig `yaml:"mid"`
|
||||||
|
Slow RRCFVariantConfig `yaml:"slow"`
|
||||||
|
}
|
||||||
|
|
||||||
|
// SEADConfig holds tunable parameters for the SEAD ensemble.
|
||||||
|
// Only used when EnsembleConfig.Method == "sead".
|
||||||
|
type SEADConfig struct {
|
||||||
|
// Eta is the MWU learning rate η ∈ (0, 1].
|
||||||
|
// Higher values react faster to distribution shifts but are noisier.
|
||||||
|
// Recommended: 0.05–0.20. Default (if 0): 0.10.
|
||||||
|
Eta float64 `yaml:"eta"`
|
||||||
|
|
||||||
|
// Lambda is the KL-divergence regularisation strength.
|
||||||
|
// 0 = pure MWU (uniform prior). Recommended: 0.0–0.05. Default: 0.01.
|
||||||
|
Lambda float64 `yaml:"lambda"`
|
||||||
|
|
||||||
|
// QuantileWindow is the number of past scores retained per detector for
|
||||||
|
// streaming quantile normalisation. Default (if 0): 300.
|
||||||
|
QuantileWindow int `yaml:"quantile_window"`
|
||||||
|
|
||||||
|
// MinDataPoints is the cold-start guard: no anomaly is flagged until at
|
||||||
|
// least this many windows have been scored. Default (if 0): 20.
|
||||||
|
MinDataPoints int `yaml:"min_data_points"`
|
||||||
|
}
|
||||||
|
|
||||||
|
// EnsembleConfig manages the routing for the multi-model detector.
|
||||||
|
type EnsembleConfig struct {
|
||||||
|
Enabled bool `yaml:"enabled"`
|
||||||
|
|
||||||
|
// Method selects the score-aggregation strategy.
|
||||||
|
// Allowed values: "avg" (default), "max", "median", "sead".
|
||||||
|
// "sead": adaptive Multiplicative Weights Update ensemble (Shah et al., ICML 2025).
|
||||||
|
Method string `yaml:"method"`
|
||||||
|
|
||||||
|
// Contamination is the expected fraction of anomalous windows ∈ [0, 0.5).
|
||||||
|
// Determines the decision threshold as quantile(1-contamination) of
|
||||||
|
// the rolling combined score history.
|
||||||
|
Contamination float64 `yaml:"contamination"`
|
||||||
|
|
||||||
|
// SEAD tuning parameters (only applied when Method == "sead").
|
||||||
|
SEAD SEADConfig `yaml:"sead"`
|
||||||
|
}
|
||||||
|
|
||||||
|
// AutoScalingConfig holds thresholds and durations for dynamic detector switching.
|
||||||
|
type AutoScalingConfig struct {
|
||||||
|
Enabled bool `yaml:"enabled"`
|
||||||
|
HighThreshold float64 `yaml:"high_threshold"` // e.g. 0.75 (Normal -> High)
|
||||||
|
CritThreshold float64 `yaml:"critical_threshold"` // e.g. 0.90 (High -> Critical)
|
||||||
|
HighDuration float64 `yaml:"high_duration"` // e.g. 30.0 (seconds)
|
||||||
|
CritDuration float64 `yaml:"critical_duration"` // e.g. 15.0 (seconds)
|
||||||
|
DownThreshold float64 `yaml:"down_threshold"` // e.g. 0.50 (back to Normal)
|
||||||
|
DownDuration float64 `yaml:"down_duration"` // e.g. 60.0 (seconds)
|
||||||
|
}
|
||||||
|
|
||||||
|
// DetectorConfig groups all anomaly detection configurations.
|
||||||
|
type DetectorConfig struct {
|
||||||
|
Method string `yaml:"method"`
|
||||||
|
Ensemble EnsembleConfig `yaml:"ensemble"`
|
||||||
|
MAD MADConfig `yaml:"mad"`
|
||||||
|
COPOD COPODConfig `yaml:"copod"`
|
||||||
|
// RRCF is used by the standalone detector and the AVG/MAX/MEDIAN ensemble paths.
|
||||||
|
RRCF RRCFConfig `yaml:"rrcf"`
|
||||||
|
// RRCFVariants configures the three-horizon RRCF instances for the SEAD ensemble.
|
||||||
|
// Defaults are applied automatically when fields are zero.
|
||||||
|
RRCFVariants RRCFVariantsConfig `yaml:"rrcf_variants"`
|
||||||
|
AutoScaling AutoScalingConfig `yaml:"auto_scaling"`
|
||||||
|
}
|
||||||
|
|
||||||
|
// Config is the top-level pipeline configuration.
|
||||||
|
type Config struct {
|
||||||
|
Ingestion struct {
|
||||||
|
LogPath string `yaml:"log_path"`
|
||||||
|
NetInterface string `yaml:"net_interface"`
|
||||||
|
DiskDevice string `yaml:"disk_device"`
|
||||||
|
SystemctlServices []string `yaml:"systemctl_services"`
|
||||||
|
} `yaml:"ingestion"`
|
||||||
|
|
||||||
|
Transformation struct {
|
||||||
|
WindowSize time.Duration `yaml:"window_size"`
|
||||||
|
DbPath string `yaml:"db_path"`
|
||||||
|
} `yaml:"transformation"`
|
||||||
|
|
||||||
|
Drain struct {
|
||||||
|
Depth int `yaml:"depth"`
|
||||||
|
SimThreshold float64 `yaml:"sim_threshold"`
|
||||||
|
MaxChildren int `yaml:"max_children"`
|
||||||
|
MaxClusters int `yaml:"max_clusters"`
|
||||||
|
MaskingPatterns []MaskingPattern `yaml:"masking_patterns"`
|
||||||
|
} `yaml:"drain"`
|
||||||
|
|
||||||
|
Detection DetectorConfig `yaml:"detector"`
|
||||||
|
|
||||||
|
Output struct {
|
||||||
|
FeatureLogPath string `yaml:"feature_log_path"`
|
||||||
|
AnomalyLogPath string `yaml:"anomaly_log_path"`
|
||||||
|
} `yaml:"output"`
|
||||||
|
}
|
||||||
|
|
||||||
|
// LoadConfig reads and decodes the YAML file at path.
|
||||||
|
func LoadConfig(path string) (*Config, error) {
|
||||||
|
f, err := os.Open(path)
|
||||||
|
if err != nil {
|
||||||
|
return nil, fmt.Errorf("config: open %q: %w", path, err)
|
||||||
|
}
|
||||||
|
defer f.Close()
|
||||||
|
|
||||||
|
var cfg Config
|
||||||
|
dec := yaml.NewDecoder(f)
|
||||||
|
dec.KnownFields(false)
|
||||||
|
if err := dec.Decode(&cfg); err != nil {
|
||||||
|
return nil, fmt.Errorf("config: decode %q: %w", path, err)
|
||||||
|
}
|
||||||
|
return &cfg, nil
|
||||||
|
}
|
||||||
|
|
||||||
|
// Compile compiles all MaskingPattern.Pattern strings into *regexp.Regexp.
|
||||||
|
func (c *Config) Compile() error {
|
||||||
|
for i := range c.Drain.MaskingPatterns {
|
||||||
|
mp := &c.Drain.MaskingPatterns[i]
|
||||||
|
re, err := regexp.Compile(mp.Pattern)
|
||||||
|
if err != nil {
|
||||||
|
return fmt.Errorf("config: compile pattern %q: %w", mp.Name, err)
|
||||||
|
}
|
||||||
|
mp.Re = re
|
||||||
|
}
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
|
||||||
|
// NumericPatternNames returns the ordered list of MaskingPattern names whose
|
||||||
|
// Type is "float" or "int".
|
||||||
|
func (c *Config) NumericPatternNames() []string {
|
||||||
|
names := make([]string, 0, len(c.Drain.MaskingPatterns))
|
||||||
|
for _, mp := range c.Drain.MaskingPatterns {
|
||||||
|
if mp.Name != "" && (mp.Type == "float" || mp.Type == "int") {
|
||||||
|
names = append(names, mp.Name)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return names
|
||||||
|
}
|
||||||
98
internal/detect/copod.go
Normal file
98
internal/detect/copod.go
Normal file
|
|
@ -0,0 +1,98 @@
|
||||||
|
// Package detect provides anomaly detection algorithms and ensemble logic.
|
||||||
|
package detect
|
||||||
|
|
||||||
|
import (
|
||||||
|
"fmt"
|
||||||
|
"log"
|
||||||
|
|
||||||
|
"codeberg.org/pata1704/copod"
|
||||||
|
"codeberg.org/pata1704/guenther/pkg/types"
|
||||||
|
)
|
||||||
|
|
||||||
|
// COPODDetector implements the AnomalyDetector interface by wrapping the
|
||||||
|
// external codeberg.org/pata1704/copod package.
|
||||||
|
//
|
||||||
|
// Streaming mode: Score calls Update internally, so the sliding-window buffer
|
||||||
|
// stays current without requiring a separate Update call. Callers (like SEAD)
|
||||||
|
// only need to call Score per time step.
|
||||||
|
//
|
||||||
|
// Fit seeds the buffer with a batch of normal vectors. If Fit is not called
|
||||||
|
// the detector starts cold and returns score=0 until the buffer has enough
|
||||||
|
// points (controlled by bufferSize in the underlying library).
|
||||||
|
type COPODDetector struct {
|
||||||
|
detector *copod.Detector
|
||||||
|
}
|
||||||
|
|
||||||
|
// NewCOPODDetector initialises the streaming COPOD detector wrapper.
|
||||||
|
//
|
||||||
|
// - bufferSize: sliding-window capacity. Recommended: 100–200.
|
||||||
|
// - threshold: score cutoff for standalone IsAnomaly. When used inside
|
||||||
|
// SEAD the threshold is ignored (SEAD applies its own adaptive threshold).
|
||||||
|
func NewCOPODDetector(bufferSize int, threshold float64) (*COPODDetector, error) {
|
||||||
|
det, err := copod.NewDetector(bufferSize, threshold)
|
||||||
|
if err != nil {
|
||||||
|
return nil, fmt.Errorf("copod: initialize wrapped detector: %w", err)
|
||||||
|
}
|
||||||
|
return &COPODDetector{
|
||||||
|
detector: det,
|
||||||
|
}, nil
|
||||||
|
}
|
||||||
|
|
||||||
|
// Fit seeds the COPOD history buffer with a slice of labelled-normal vectors.
|
||||||
|
func (c *COPODDetector) Fit(vectors []types.FeatureVector) error {
|
||||||
|
for _, v := range vectors {
|
||||||
|
if err := c.update(v); err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
|
||||||
|
// Update adds a single observation to the sliding window.
|
||||||
|
// Safe to call concurrently with Score.
|
||||||
|
func (c *COPODDetector) Update(vector types.FeatureVector) error {
|
||||||
|
return c.update(vector)
|
||||||
|
}
|
||||||
|
|
||||||
|
// Score computes the COPOD anomaly score for the given vector and
|
||||||
|
// simultaneously updates the internal sliding window with the scored vector.
|
||||||
|
//
|
||||||
|
// The self-update ensures COPOD's buffer reflects the current data stream
|
||||||
|
// without requiring a separate Update call after every Score. This is
|
||||||
|
// consistent with the RRCF and IsolationForest detectors which also
|
||||||
|
// update themselves inside Score.
|
||||||
|
func (c *COPODDetector) Score(vector types.FeatureVector) (types.AnomalyResult, error) {
|
||||||
|
vec := copod.FeatureVector{
|
||||||
|
NormalizedVector: vector.NormalizedVector,
|
||||||
|
Timestamp: vector.Timestamp,
|
||||||
|
}
|
||||||
|
|
||||||
|
// Score first, then append to the buffer so the scored point does not
|
||||||
|
// bias its own copula calculation (score-then-insert, same as RRCF).
|
||||||
|
res, err := c.detector.Score(vec)
|
||||||
|
if err != nil {
|
||||||
|
return types.AnomalyResult{}, fmt.Errorf("copod: score: %w", err)
|
||||||
|
}
|
||||||
|
|
||||||
|
if err := c.update(vector); err != nil {
|
||||||
|
// Log but don't fail: the score is already computed.
|
||||||
|
log.Printf("copod: update after score: %v", err)
|
||||||
|
}
|
||||||
|
|
||||||
|
return types.AnomalyResult{
|
||||||
|
Timestamp: res.Timestamp,
|
||||||
|
Score: res.Score,
|
||||||
|
IsAnomaly: res.IsAnomaly,
|
||||||
|
Confidence: res.Confidence,
|
||||||
|
Method: res.Method,
|
||||||
|
}, nil
|
||||||
|
}
|
||||||
|
|
||||||
|
// update is the internal helper that adds vector to the copod sliding window.
|
||||||
|
func (c *COPODDetector) update(vector types.FeatureVector) error {
|
||||||
|
vec := copod.FeatureVector{
|
||||||
|
NormalizedVector: vector.NormalizedVector,
|
||||||
|
Timestamp: vector.Timestamp,
|
||||||
|
}
|
||||||
|
return c.detector.Update(vec)
|
||||||
|
}
|
||||||
325
internal/detect/ensemble.go
Normal file
325
internal/detect/ensemble.go
Normal file
|
|
@ -0,0 +1,325 @@
|
||||||
|
// Package detect provides anomaly detection algorithms and ensemble logic.
|
||||||
|
package detect
|
||||||
|
|
||||||
|
import (
|
||||||
|
"fmt"
|
||||||
|
"math"
|
||||||
|
"sort"
|
||||||
|
"strings"
|
||||||
|
"sync"
|
||||||
|
|
||||||
|
"codeberg.org/pata1704/guenther/pkg/types"
|
||||||
|
)
|
||||||
|
|
||||||
|
// EnsembleMethod names the strategy EnsembleDetector uses to aggregate the
// scores of its sub-detectors.
type EnsembleMethod string

// Supported aggregation strategies.
const (
	// EnsembleAVG is the arithmetic mean of the normalised sub-scores.
	EnsembleAVG EnsembleMethod = "avg"

	// EnsembleMAX is the maximum of the normalised sub-scores (aggressive).
	EnsembleMAX EnsembleMethod = "max"

	// EnsembleMEDIAN is the median of the normalised sub-scores (robust to
	// outliers).
	EnsembleMEDIAN EnsembleMethod = "median"

	// EnsembleSEAD delegates to an embedded SEADDetector (adaptive MWU
	// weights). It is selected with detector.ensemble.method = "sead" in the
	// config. The four base detectors (MAD, RRCF, COPOD, IForest) are
	// instantiated with the same parameters as in the non-SEAD ensemble
	// paths, and the SEAD wrapper performs the online weight updates
	// automatically.
	EnsembleSEAD EnsembleMethod = "sead"
)
|
||||||
|
|
||||||
|
// RRCFVariantConfig describes one named RRCF instance of the SEAD
// multi-horizon ensemble.
type RRCFVariantConfig struct {
	// NumTrees is the forest size. More trees give a smoother, more
	// conservative score.
	NumTrees int

	// TreeSize is the sliding-window capacity of each tree.
	TreeSize int

	// ThresholdPercentile is the decision threshold applied when this
	// model is used on its own.
	ThresholdPercentile float64
}
|
||||||
|
|
||||||
|
// RRCFVariantsConfig groups the three RRCF horizon variants used by the SEAD ensemble.
|
||||||
|
// - Fast: short memory, reactive to transient spikes
|
||||||
|
// - Mid: medium memory, balanced sensitivity
|
||||||
|
// - Slow: long memory, detects sustained / slow-drift events
|
||||||
|
type RRCFVariantsConfig struct {
|
||||||
|
Fast RRCFVariantConfig
|
||||||
|
Mid RRCFVariantConfig
|
||||||
|
Slow RRCFVariantConfig
|
||||||
|
}
|
||||||
|
|
||||||
|
// EnsembleDetector implements the AnomalyDetector interface by combining
|
||||||
|
// COPOD and RRCF scores using min-max normalisation.
|
||||||
|
//
|
||||||
|
// Scoring strategy (AVG / MAX / MEDIAN methods):
|
||||||
|
// 1. Each model produces a raw score on its own scale.
|
||||||
|
// 2. Both scores are normalised to [0, 1] using a rolling min/max window.
|
||||||
|
// 3. The combined score is the result of the selected aggregation function.
|
||||||
|
// 4. A window is flagged anomalous when combinedScore > threshold where
|
||||||
|
// threshold = quantile(combinedHistory, 1-contamination).
|
||||||
|
//
|
||||||
|
// SEAD method:
|
||||||
|
//
|
||||||
|
// When method == EnsembleSEAD the detector delegates entirely to an embedded
|
||||||
|
// SEADDetector which wraps all four base detectors and uses Multiplicative
|
||||||
|
// Weights Update (MWU/FTRL) to adapt weights online. The COPOD and RRCF
|
||||||
|
// sub-detectors passed to NewEnsembleDetector are still created but are only
|
||||||
|
// used when method != EnsembleSEAD.
|
||||||
|
type EnsembleDetector struct {
|
||||||
|
method EnsembleMethod
|
||||||
|
|
||||||
|
// sub-detectors for AVG/MAX/MEDIAN methods
|
||||||
|
copod AnomalyDetector
|
||||||
|
rrcf AnomalyDetector
|
||||||
|
|
||||||
|
// SEAD method: fully adaptive ensemble (replaces copod+rrcf when active)
|
||||||
|
sead *SEADDetector
|
||||||
|
|
||||||
|
contamination float64
|
||||||
|
|
||||||
|
mu sync.Mutex
|
||||||
|
copodHistory []float64
|
||||||
|
rrcfHistory []float64
|
||||||
|
combinedHistory []float64
|
||||||
|
historySize int
|
||||||
|
}
|
||||||
|
|
||||||
|
// NewEnsembleDetector initialises the multi-model ensemble.
|
||||||
|
//
|
||||||
|
// - method: "avg" | "max" | "median" | "sead"
|
||||||
|
// - copodBufferSize: sliding-window capacity for COPOD (≥ 100 recommended).
|
||||||
|
// - copodThreshold: per-model threshold passed to COPODDetector.
|
||||||
|
// - rrcfVariants: three-horizon RRCF config (fast/mid/slow). Used by SEAD;
|
||||||
|
// the Mid variant is also used for the classic AVG/MAX/MEDIAN path.
|
||||||
|
// - contamination: expected fraction of anomalies ∈ [0, 0.5).
|
||||||
|
// - seadCfg: SEAD parameters (only used when method == "sead").
|
||||||
|
// Pass detect.DefaultSEADConfig() when method != "sead".
|
||||||
|
func NewEnsembleDetector(
|
||||||
|
method EnsembleMethod,
|
||||||
|
copodBufferSize int, copodThreshold float64,
|
||||||
|
rrcfVariants RRCFVariantsConfig,
|
||||||
|
contamination float64,
|
||||||
|
seadCfg SEADConfig,
|
||||||
|
) (*EnsembleDetector, error) {
|
||||||
|
e := &EnsembleDetector{
|
||||||
|
method: method,
|
||||||
|
contamination: contamination,
|
||||||
|
historySize: 1000,
|
||||||
|
}
|
||||||
|
|
||||||
|
if method == EnsembleSEAD {
|
||||||
|
// Delegate to SEADDetector with all six base detectors (3 RRCF horizons).
|
||||||
|
// MAD is bootstrapped with identity priors (median=0, MAD=1); it will
|
||||||
|
// calibrate itself during the pipeline warm-up phase.
|
||||||
|
sead, err := NewSEADWithAllDetectors(
|
||||||
|
copodBufferSize, copodThreshold,
|
||||||
|
rrcfVariants,
|
||||||
|
3.5, 0, // madThreshold=3.5, madCalibSize=0→default 100 vectors
|
||||||
|
seadCfg,
|
||||||
|
)
|
||||||
|
if err != nil {
|
||||||
|
return nil, fmt.Errorf("ensemble: sead: %w", err)
|
||||||
|
}
|
||||||
|
e.sead = sead
|
||||||
|
} else {
|
||||||
|
// Classic AVG/MAX/MEDIAN path: only COPOD + RRCF (Mid variant as default).
|
||||||
|
copodDet, err := NewCOPODDetector(copodBufferSize, copodThreshold)
|
||||||
|
if err != nil {
|
||||||
|
return nil, fmt.Errorf("ensemble: %w", err)
|
||||||
|
}
|
||||||
|
e.copod = copodDet
|
||||||
|
// Use Mid variant defaults for the classic ensemble path.
|
||||||
|
midTrees := rrcfVariants.Mid.NumTrees
|
||||||
|
if midTrees == 0 {
|
||||||
|
midTrees = 150
|
||||||
|
}
|
||||||
|
midSize := rrcfVariants.Mid.TreeSize
|
||||||
|
if midSize == 0 {
|
||||||
|
midSize = 64
|
||||||
|
}
|
||||||
|
midPct := rrcfVariants.Mid.ThresholdPercentile
|
||||||
|
if midPct == 0 {
|
||||||
|
midPct = 0.85
|
||||||
|
}
|
||||||
|
e.rrcf = NewRRCFDetector(midTrees, midSize, 0, midPct)
|
||||||
|
}
|
||||||
|
|
||||||
|
return e, nil
|
||||||
|
}
|
||||||
|
|
||||||
|
// SEAD returns the underlying SEADDetector if the ensemble is in SEAD mode.
|
||||||
|
func (e *EnsembleDetector) SEAD() *SEADDetector {
|
||||||
|
e.mu.Lock()
|
||||||
|
defer e.mu.Unlock()
|
||||||
|
return e.sead
|
||||||
|
}
|
||||||
|
|
||||||
|
// Fit seeds the underlying models from a slice of feature vectors.
|
||||||
|
func (e *EnsembleDetector) Fit(vectors []types.FeatureVector) error {
|
||||||
|
if e.method == EnsembleSEAD {
|
||||||
|
return e.sead.Fit(vectors)
|
||||||
|
}
|
||||||
|
if err := e.copod.Fit(vectors); err != nil {
|
||||||
|
return fmt.Errorf("ensemble: fit copod: %w", err)
|
||||||
|
}
|
||||||
|
if err := e.rrcf.Fit(vectors); err != nil {
|
||||||
|
return fmt.Errorf("ensemble: fit rrcf: %w", err)
|
||||||
|
}
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
|
||||||
|
// Update propagates the vector to the underlying models.
|
||||||
|
func (e *EnsembleDetector) Update(vector types.FeatureVector) error {
|
||||||
|
if e.method == EnsembleSEAD {
|
||||||
|
return e.sead.Update(vector)
|
||||||
|
}
|
||||||
|
if err := e.copod.Update(vector); err != nil {
|
||||||
|
return fmt.Errorf("ensemble: update copod: %w", err)
|
||||||
|
}
|
||||||
|
if err := e.rrcf.Update(vector); err != nil {
|
||||||
|
return fmt.Errorf("ensemble: update rrcf: %w", err)
|
||||||
|
}
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
|
||||||
|
// Score evaluates the feature vector.
|
||||||
|
//
|
||||||
|
// For SEAD method: delegates entirely to the embedded SEADDetector.
|
||||||
|
// For AVG/MAX/MEDIAN: min-max normalises COPOD and RRCF scores and aggregates.
|
||||||
|
func (e *EnsembleDetector) Score(vector types.FeatureVector) (types.AnomalyResult, error) {
|
||||||
|
if e.method == EnsembleSEAD {
|
||||||
|
res, err := e.sead.Score(vector)
|
||||||
|
if err != nil {
|
||||||
|
return types.AnomalyResult{}, fmt.Errorf("ensemble: sead score: %w", err)
|
||||||
|
}
|
||||||
|
return res, nil
|
||||||
|
}
|
||||||
|
|
||||||
|
resCOPOD, err := e.copod.Score(vector)
|
||||||
|
if err != nil {
|
||||||
|
return types.AnomalyResult{}, fmt.Errorf("ensemble: score copod: %w", err)
|
||||||
|
}
|
||||||
|
|
||||||
|
resRRCF, err := e.rrcf.Score(vector)
|
||||||
|
if err != nil {
|
||||||
|
return types.AnomalyResult{}, fmt.Errorf("ensemble: score rrcf: %w", err)
|
||||||
|
}
|
||||||
|
|
||||||
|
e.mu.Lock()
|
||||||
|
defer e.mu.Unlock()
|
||||||
|
|
||||||
|
e.appendHistory(&e.copodHistory, resCOPOD.Score)
|
||||||
|
e.appendHistory(&e.rrcfHistory, resRRCF.Score)
|
||||||
|
|
||||||
|
normCOPOD := minMaxNorm(resCOPOD.Score, e.copodHistory)
|
||||||
|
normRRCF := minMaxNorm(resRRCF.Score, e.rrcfHistory)
|
||||||
|
|
||||||
|
var combined float64
|
||||||
|
switch e.method {
|
||||||
|
case EnsembleMAX:
|
||||||
|
combined = math.Max(normCOPOD, normRRCF)
|
||||||
|
case EnsembleMEDIAN:
|
||||||
|
// Median of two values = average; kept for future N>2 extension.
|
||||||
|
vals := []float64{normCOPOD, normRRCF}
|
||||||
|
sort.Float64s(vals)
|
||||||
|
combined = vals[len(vals)/2]
|
||||||
|
default: // EnsembleAVG
|
||||||
|
combined = (normCOPOD + normRRCF) / 2.0
|
||||||
|
}
|
||||||
|
|
||||||
|
e.appendHistory(&e.combinedHistory, combined)
|
||||||
|
|
||||||
|
const minDataPoints = 10
|
||||||
|
threshold := quantile(e.combinedHistory, 1.0-e.contamination)
|
||||||
|
isAnomaly := len(e.combinedHistory) > minDataPoints && combined > threshold
|
||||||
|
|
||||||
|
return types.AnomalyResult{
|
||||||
|
Timestamp: vector.Timestamp,
|
||||||
|
Score: combined,
|
||||||
|
IsAnomaly: isAnomaly,
|
||||||
|
Confidence: math.Min(combined/math.Max(threshold, 1e-9), 1.0),
|
||||||
|
Method: e.methodString(string(e.method), resCOPOD.IsAnomaly, resRRCF.IsAnomaly),
|
||||||
|
}, nil
|
||||||
|
}
|
||||||
|
|
||||||
|
// WeightSummary returns the current SEAD detector weights as a human-readable
|
||||||
|
// string. Returns "" when the ensemble is not using SEAD.
|
||||||
|
func (e *EnsembleDetector) WeightSummary() string {
|
||||||
|
if e.method != EnsembleSEAD || e.sead == nil {
|
||||||
|
return ""
|
||||||
|
}
|
||||||
|
return e.sead.WeightSummary()
|
||||||
|
}
|
||||||
|
|
||||||
|
// appendHistory appends v to *h, evicting the oldest entry when full.
|
||||||
|
// Caller must hold e.mu.
|
||||||
|
func (e *EnsembleDetector) appendHistory(h *[]float64, v float64) {
|
||||||
|
*h = append(*h, v)
|
||||||
|
if len(*h) > e.historySize {
|
||||||
|
*h = (*h)[1:]
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// methodString builds a concise label for AnomalyResult.Method.
|
||||||
|
func (e *EnsembleDetector) methodString(method string, copodAnomaly, rrcfAnomaly bool) string {
|
||||||
|
var active []string
|
||||||
|
if copodAnomaly {
|
||||||
|
active = append(active, "COPOD")
|
||||||
|
}
|
||||||
|
if rrcfAnomaly {
|
||||||
|
active = append(active, "RRCF")
|
||||||
|
}
|
||||||
|
if len(active) > 0 {
|
||||||
|
return fmt.Sprintf("Ensemble-%s(%s)", strings.ToUpper(method), strings.Join(active, "+"))
|
||||||
|
}
|
||||||
|
return fmt.Sprintf("Ensemble-%s(none)", strings.ToUpper(method))
|
||||||
|
}
|
||||||
|
|
||||||
|
// ── score helpers ─────────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
// minMaxNorm rescales v into [0, 1] relative to the smallest and largest
// value found in history.
//
// Special cases: an empty history yields 0; a history whose spread is below
// 1e-12 (effectively constant) yields the neutral value 0.5; values outside
// the observed range are clamped to 0 or 1.
func minMaxNorm(v float64, history []float64) float64 {
	if len(history) == 0 {
		return 0
	}

	lo, hi := history[0], history[0]
	for i := 1; i < len(history); i++ {
		sample := history[i]
		if sample < lo {
			lo = sample
		}
		if sample > hi {
			hi = sample
		}
	}

	width := hi - lo
	if width < 1e-12 {
		return 0.5
	}

	scaled := (v - lo) / width
	switch {
	case scaled < 0:
		return 0
	case scaled > 1:
		return 1
	}
	return scaled
}
|
||||||
|
|
||||||
|
// quantile returns the p-th quantile of data without modifying the slice.
//
// The value is picked by rank from a sorted copy: index = int(n*p), clamped
// to [0, n-1]. No interpolation is performed. An empty slice yields 0.
//
// The lower clamp matters for out-of-range p: a negative p (for example
// 1-contamination with contamination > 1) or a NaN would otherwise produce a
// negative index and panic.
func quantile(data []float64, p float64) float64 {
	n := len(data)
	if n == 0 {
		return 0
	}

	// Sort a private copy so the caller's history keeps its order.
	sorted := make([]float64, n)
	copy(sorted, data)
	sort.Float64s(sorted)

	idx := int(float64(n) * p)
	if idx < 0 {
		idx = 0
	}
	if idx >= n {
		idx = n - 1
	}
	return sorted[idx]
}
|
||||||
200
internal/detect/iforest.go
Normal file
200
internal/detect/iforest.go
Normal file
|
|
@ -0,0 +1,200 @@
|
||||||
|
package detect
|
||||||
|
|
||||||
|
import (
|
||||||
|
"log"
|
||||||
|
"sync"
|
||||||
|
|
||||||
|
"codeberg.org/pata1704/guenther/pkg/types"
|
||||||
|
"github.com/e-XpertSolutions/go-iforest/iforest"
|
||||||
|
)
|
||||||
|
|
||||||
|
// IsolationForestDetector wraps go-iforest with thread-safe access and
|
||||||
|
// continuous background retraining on non-anomalous data to handle concept drift.
|
||||||
|
//
|
||||||
|
// During the warmup phase (model == nil) incoming vectors are buffered.
|
||||||
|
// Once warmupSize vectors have accumulated, the first training run executes
|
||||||
|
// synchronously so that the detector is never in an undefined trained state
|
||||||
|
// after the first window tick.
|
||||||
|
//
|
||||||
|
// Subsequent retraining is asynchronous: when trainingBuffer reaches
|
||||||
|
// bufferSize the buffer is swapped out under the lock, and training runs in
|
||||||
|
// a detached goroutine. The current model remains active during retraining,
|
||||||
|
// so scoring never blocks.
|
||||||
|
type IsolationForestDetector struct {
|
||||||
|
mu sync.RWMutex
|
||||||
|
model *iforest.Forest
|
||||||
|
trainingBuffer []types.FeatureVector
|
||||||
|
|
||||||
|
// Tuning knobs – set via constructor.
|
||||||
|
numTrees int
|
||||||
|
subSample int
|
||||||
|
contamination float64
|
||||||
|
bufferSize int
|
||||||
|
warmupSize int
|
||||||
|
threshold float64
|
||||||
|
}
|
||||||
|
|
||||||
|
// NewIsolationForestDetector creates a detector with the given parameters.
|
||||||
|
//
|
||||||
|
// - bufferSize: number of non-anomalous vectors to accumulate before
|
||||||
|
// triggering background retraining.
|
||||||
|
// - warmupSize: number of vectors to accumulate before the first (sync)
|
||||||
|
// training run. Must be ≤ bufferSize.
|
||||||
|
// - numTrees: number of isolation trees (typically 100).
|
||||||
|
// - subSample: subsample size per tree (typically 256).
|
||||||
|
// - contamination: expected fraction of anomalies (0 < c < 0.5).
|
||||||
|
// - threshold: score cutoff for IsAnomaly.
|
||||||
|
func NewIsolationForestDetector(
|
||||||
|
bufferSize, warmupSize, numTrees, subSample int,
|
||||||
|
contamination, threshold float64,
|
||||||
|
) *IsolationForestDetector {
|
||||||
|
if warmupSize <= 0 || warmupSize > bufferSize {
|
||||||
|
warmupSize = bufferSize
|
||||||
|
}
|
||||||
|
return &IsolationForestDetector{
|
||||||
|
bufferSize: bufferSize,
|
||||||
|
warmupSize: warmupSize,
|
||||||
|
numTrees: numTrees,
|
||||||
|
subSample: subSample,
|
||||||
|
contamination: contamination,
|
||||||
|
threshold: threshold,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Fit trains a new Isolation Forest on vectors.
|
||||||
|
// Fit is safe to call concurrently with Score (uses a write lock only while
|
||||||
|
// swapping the model pointer).
|
||||||
|
func (d *IsolationForestDetector) Fit(vectors []types.FeatureVector) error {
|
||||||
|
if len(vectors) == 0 {
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
|
||||||
|
data := convertToMatrix(vectors)
|
||||||
|
forest := iforest.NewForest(d.numTrees, d.subSample, d.contamination)
|
||||||
|
forest.Train(data)
|
||||||
|
forest.Test(data)
|
||||||
|
|
||||||
|
d.mu.Lock()
|
||||||
|
d.model = forest
|
||||||
|
d.mu.Unlock()
|
||||||
|
|
||||||
|
log.Printf("iforest: trained on %d samples (trees=%d, subsample=%d, contamination=%.3f)",
|
||||||
|
len(vectors), d.numTrees, d.subSample, d.contamination)
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
|
||||||
|
// Score returns an AnomalyResult for vector.
|
||||||
|
//
|
||||||
|
// Pre-model (warmup) behaviour:
|
||||||
|
// - Vector is appended to trainingBuffer.
|
||||||
|
// - Once warmupSize is reached the first training run executes synchronously
|
||||||
|
// on the calling goroutine so subsequent Score calls have a model.
|
||||||
|
// - Returns score=0, IsAnomaly=false while warming up.
|
||||||
|
//
|
||||||
|
// Post-model behaviour:
|
||||||
|
// - Score is computed via the active model (read-lock only).
|
||||||
|
// - Non-anomalous vectors are appended to trainingBuffer.
|
||||||
|
// - When trainingBuffer reaches bufferSize, a background retrain fires.
|
||||||
|
func (d *IsolationForestDetector) Score(vector types.FeatureVector) (types.AnomalyResult, error) {
|
||||||
|
warmup := types.AnomalyResult{
|
||||||
|
Timestamp: vector.Timestamp,
|
||||||
|
Score: 0,
|
||||||
|
IsAnomaly: false,
|
||||||
|
Method: "IF",
|
||||||
|
}
|
||||||
|
|
||||||
|
// ── warmup phase ──────────────────────────────────────────────────────
|
||||||
|
d.mu.RLock()
|
||||||
|
model := d.model
|
||||||
|
d.mu.RUnlock()
|
||||||
|
|
||||||
|
if model == nil {
|
||||||
|
d.mu.Lock()
|
||||||
|
d.trainingBuffer = append(d.trainingBuffer, vector)
|
||||||
|
bufLen := len(d.trainingBuffer)
|
||||||
|
d.mu.Unlock()
|
||||||
|
|
||||||
|
if bufLen < d.warmupSize {
|
||||||
|
return warmup, nil
|
||||||
|
}
|
||||||
|
|
||||||
|
// Synchronous first fit to eliminate the cold-start gap.
|
||||||
|
d.mu.Lock()
|
||||||
|
buf := d.trainingBuffer
|
||||||
|
d.trainingBuffer = nil
|
||||||
|
d.mu.Unlock()
|
||||||
|
|
||||||
|
if err := d.Fit(buf); err != nil {
|
||||||
|
return warmup, err
|
||||||
|
}
|
||||||
|
|
||||||
|
d.mu.RLock()
|
||||||
|
model = d.model
|
||||||
|
d.mu.RUnlock()
|
||||||
|
|
||||||
|
if model == nil {
|
||||||
|
return warmup, nil // Fit failed silently – defensive
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// ── inference ─────────────────────────────────────────────────────────
|
||||||
|
_, scores, err := model.Predict([][]float64{vector.NormalizedVector})
|
||||||
|
if err != nil {
|
||||||
|
return warmup, err
|
||||||
|
}
|
||||||
|
if len(scores) == 0 {
|
||||||
|
return warmup, nil
|
||||||
|
}
|
||||||
|
score := scores[0]
|
||||||
|
|
||||||
|
res := types.AnomalyResult{
|
||||||
|
Timestamp: vector.Timestamp,
|
||||||
|
Score: score,
|
||||||
|
IsAnomaly: score > d.threshold,
|
||||||
|
Confidence: score,
|
||||||
|
Method: "IF",
|
||||||
|
}
|
||||||
|
|
||||||
|
// Buffer non-anomalous vectors for background retraining.
|
||||||
|
if !res.IsAnomaly {
|
||||||
|
if err := d.Update(vector); err != nil {
|
||||||
|
log.Printf("iforest: update buffer: %v", err)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return res, nil
|
||||||
|
}
|
||||||
|
|
||||||
|
// Update appends a non-anomalous vector to the training buffer.
|
||||||
|
// If the buffer is full it is swapped atomically and a background goroutine
|
||||||
|
// retrains the model on the captured data.
|
||||||
|
func (d *IsolationForestDetector) Update(vector types.FeatureVector) error {
|
||||||
|
d.mu.Lock()
|
||||||
|
d.trainingBuffer = append(d.trainingBuffer, vector)
|
||||||
|
|
||||||
|
if len(d.trainingBuffer) < d.bufferSize {
|
||||||
|
d.mu.Unlock()
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
|
||||||
|
buf := make([]types.FeatureVector, len(d.trainingBuffer))
|
||||||
|
copy(buf, d.trainingBuffer)
|
||||||
|
d.trainingBuffer = nil
|
||||||
|
d.mu.Unlock()
|
||||||
|
|
||||||
|
go func() {
|
||||||
|
if err := d.Fit(buf); err != nil {
|
||||||
|
log.Printf("iforest: background retrain: %v", err)
|
||||||
|
}
|
||||||
|
}()
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
|
||||||
|
// ── helpers ───────────────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
func convertToMatrix(vectors []types.FeatureVector) [][]float64 {
|
||||||
|
m := make([][]float64, len(vectors))
|
||||||
|
for i, v := range vectors {
|
||||||
|
m[i] = v.NormalizedVector
|
||||||
|
}
|
||||||
|
return m
|
||||||
|
}
|
||||||
148
internal/detect/interface.go
Normal file
148
internal/detect/interface.go
Normal file
|
|
@ -0,0 +1,148 @@
|
||||||
|
package detect
|
||||||
|
|
||||||
|
import (
|
||||||
|
"context"
|
||||||
|
"log"
|
||||||
|
"sync"
|
||||||
|
"time"
|
||||||
|
|
||||||
|
"codeberg.org/pata1704/guenther/pkg/types"
|
||||||
|
)
|
||||||
|
|
||||||
|
// AnomalyDetector is the common interface for all detection algorithms.
|
||||||
|
// Implementations must be safe for concurrent use.
|
||||||
|
type AnomalyDetector interface {
|
||||||
|
// Fit trains the model on the supplied slice of labelled-normal vectors.
|
||||||
|
Fit(vectors []types.FeatureVector) error
|
||||||
|
// Score returns an anomaly assessment for vector. It must not block.
|
||||||
|
Score(vector types.FeatureVector) (types.AnomalyResult, error)
|
||||||
|
// Update buffers vector for incremental model updates.
|
||||||
|
Update(vector types.FeatureVector) error
|
||||||
|
}
|
||||||
|
|
||||||
|
// DetectionLayer reads FeatureVectors from inputChan, scores them with the
|
||||||
|
// configured AnomalyDetector, and forwards AnomalyResults to outputChan.
|
||||||
|
//
|
||||||
|
// The layer runs a single event-loop goroutine (no additional worker pool is
|
||||||
|
// needed because detection is CPU-bound in a single model, not I/O-bound).
|
||||||
|
// Health metrics are emitted to healthChan every 5 seconds.
|
||||||
|
//
|
||||||
|
// Backpressure: if outputChan is full the result is dropped and a warning is
|
||||||
|
// logged. This prevents the detection goroutine from blocking the upstream
|
||||||
|
// TransformEngine via backpressure handling.
|
||||||
|
type DetectionLayer struct {
|
||||||
|
detector AnomalyDetector
|
||||||
|
inputChan <-chan types.FeatureVector
|
||||||
|
outputChan chan<- types.AnomalyResult
|
||||||
|
healthChan chan<- types.StageHealth
|
||||||
|
|
||||||
|
scalingController *ScalingController // optional
|
||||||
|
|
||||||
|
wg sync.WaitGroup
|
||||||
|
|
||||||
|
mu sync.Mutex
|
||||||
|
processed uint64
|
||||||
|
dropped uint64
|
||||||
|
avgLatency float64
|
||||||
|
}
|
||||||
|
|
||||||
|
// NewDetectionLayer constructs a DetectionLayer wired to the given channels.
|
||||||
|
func NewDetectionLayer(
|
||||||
|
detector AnomalyDetector,
|
||||||
|
input <-chan types.FeatureVector,
|
||||||
|
output chan<- types.AnomalyResult,
|
||||||
|
health chan<- types.StageHealth,
|
||||||
|
) *DetectionLayer {
|
||||||
|
return &DetectionLayer{
|
||||||
|
detector: detector,
|
||||||
|
inputChan: input,
|
||||||
|
outputChan: output,
|
||||||
|
healthChan: health,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// SetScalingController attaches an auto-scaling controller to the layer.
|
||||||
|
func (l *DetectionLayer) SetScalingController(sc *ScalingController) {
|
||||||
|
l.scalingController = sc
|
||||||
|
}
|
||||||
|
|
||||||
|
// Start launches the detection event loop in a background goroutine.
|
||||||
|
// The method is idempotent: calling Start twice panics (close of closed channel).
|
||||||
|
func (l *DetectionLayer) Start(ctx context.Context) {
|
||||||
|
l.wg.Go(func() {
|
||||||
|
reportTicker := time.NewTicker(5 * time.Second)
|
||||||
|
defer reportTicker.Stop()
|
||||||
|
|
||||||
|
for {
|
||||||
|
select {
|
||||||
|
case fv := <-l.inputChan:
|
||||||
|
l.handle(fv)
|
||||||
|
|
||||||
|
case <-reportTicker.C:
|
||||||
|
l.emitHealth()
|
||||||
|
|
||||||
|
case <-ctx.Done():
|
||||||
|
return
|
||||||
|
}
|
||||||
|
}
|
||||||
|
})
|
||||||
|
}
|
||||||
|
|
||||||
|
// Wait waits for the event loop to exit after context cancellation.
|
||||||
|
func (l *DetectionLayer) Wait() {
|
||||||
|
l.wg.Wait()
|
||||||
|
}
|
||||||
|
|
||||||
|
func (l *DetectionLayer) handle(fv types.FeatureVector) {
|
||||||
|
if l.scalingController != nil {
|
||||||
|
l.scalingController.ObserveCPU(fv.AvgCPUPercent)
|
||||||
|
}
|
||||||
|
|
||||||
|
start := time.Now()
|
||||||
|
result, err := l.detector.Score(fv)
|
||||||
|
ms := time.Since(start).Seconds() * 1e3
|
||||||
|
|
||||||
|
l.mu.Lock()
|
||||||
|
l.processed++
|
||||||
|
if l.avgLatency == 0 {
|
||||||
|
l.avgLatency = ms
|
||||||
|
} else {
|
||||||
|
l.avgLatency = l.avgLatency*0.8 + ms*0.2
|
||||||
|
}
|
||||||
|
l.mu.Unlock()
|
||||||
|
|
||||||
|
if err != nil {
|
||||||
|
log.Printf("detection: score error: %v", err)
|
||||||
|
return
|
||||||
|
}
|
||||||
|
|
||||||
|
select {
|
||||||
|
case l.outputChan <- result:
|
||||||
|
default:
|
||||||
|
l.mu.Lock()
|
||||||
|
l.dropped++
|
||||||
|
l.mu.Unlock()
|
||||||
|
log.Printf("detection: output channel full – dropping result (score=%.4f)", result.Score)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// emitHealth sends a StageHealth snapshot to healthChan.
|
||||||
|
// Non-blocking: skips the report if healthChan is full.
|
||||||
|
func (l *DetectionLayer) emitHealth() {
|
||||||
|
l.mu.Lock()
|
||||||
|
p := l.processed
|
||||||
|
d := l.dropped
|
||||||
|
avg := l.avgLatency
|
||||||
|
l.mu.Unlock()
|
||||||
|
|
||||||
|
select {
|
||||||
|
case l.healthChan <- types.StageHealth{
|
||||||
|
StageName: "detection_layer",
|
||||||
|
EventsProcessed: p,
|
||||||
|
EventsDropped: d,
|
||||||
|
AvgLatencyMs: avg,
|
||||||
|
LastUpdate: time.Now(),
|
||||||
|
}:
|
||||||
|
default:
|
||||||
|
}
|
||||||
|
}
|
||||||
254
internal/detect/mad.go
Normal file
254
internal/detect/mad.go
Normal file
|
|
@ -0,0 +1,254 @@
|
||||||
|
// Package detect provides anomaly detection algorithms and ensemble logic.
|
||||||
|
package detect
|
||||||
|
|
||||||
|
import (
|
||||||
|
"log"
|
||||||
|
"math"
|
||||||
|
"sort"
|
||||||
|
"sync"
|
||||||
|
|
||||||
|
"codeberg.org/pata1704/guenther/pkg/types"
|
||||||
|
)
|
||||||
|
|
||||||
|
// MADDetector scores feature vectors with per-feature Median Absolute
// Deviation (MAD) statistics that are either supplied up front or derived
// automatically.
//
// # Auto-calibration
//
// A detector created with NewMADDetectorAutoCalibrate starts without
// statistics. It buffers the first calibrationSize non-empty
// NormalizedVectors seen by Score, then computes per-feature medians and
// MADs from that buffer. Until the buffer is full, Score returns the result
// of the identity-prior fallback (scoreIdentity).
//
//	detector := NewMADDetectorAutoCalibrate(3.5, 100)
//
// # Calibration contract
//
// medians and mads must be computed from the SAME representation that
// arrives in vector.NormalizedVector, i.e. from the RobustScaler-scaled
// feature vectors, NOT from raw window aggregates.
//
// # Scoring
//
// For each feature i the modified Z-score is
//
//	score_i = |x_i - median_i| / (1.4826 * MAD_i)
//
// The constant 1.4826 ≈ 1/Φ⁻¹(3/4) makes MAD a consistent estimator of σ
// under normality (Rousseeuw & Croux, 1993). The anomaly score is the
// largest modified Z-score over all features.
//
// # Fit / Update
//
// Fit replaces the current statistics with values derived from the supplied
// vectors and marks the detector calibrated. Update runs Score and discards
// the result, so it only has an effect while auto-calibration is still
// collecting vectors.
type MADDetector struct {
	// mu guards the statistics and the calibration state.
	mu        sync.Mutex
	threshold float64
	medians   []float64 // per-feature median of NormalizedVector in baseline
	mads      []float64 // per-feature MAD of NormalizedVector in baseline

	// Auto-calibration state. calibrationSize == 0 means disabled.
	calibrationSize int
	calibrationBuf  [][]float64 // copies of NormalizedVectors collected during warmup
	calibrated      bool
}
|
||||||
|
|
||||||
|
// NewMADDetector creates a MADDetector with pre-calibrated baseline statistics.
|
||||||
|
//
|
||||||
|
// - threshold: anomaly score cutoff (modified Z-score). Typical: 2.5–4.0.
|
||||||
|
// - medians: per-feature median computed from NormalizedVector in baseline.
|
||||||
|
// - mads: per-feature MAD computed from NormalizedVector in baseline.
|
||||||
|
// Zero entries are replaced with 1.0 to avoid division-by-zero.
|
||||||
|
//
|
||||||
|
// Pass nil for medians and mads only when calibrationSize > 0 is set via
|
||||||
|
// NewMADDetectorAutoCalibrate; otherwise all scores will be zero.
|
||||||
|
func NewMADDetector(threshold float64, medians, mads []float64) *MADDetector {
|
||||||
|
return &MADDetector{
|
||||||
|
threshold: threshold,
|
||||||
|
medians: medians,
|
||||||
|
mads: mads,
|
||||||
|
calibrated: len(medians) > 0,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// NewMADDetectorAutoCalibrate creates a MADDetector that derives its own
|
||||||
|
// per-feature statistics from the first calibrationSize NormalizedVectors
|
||||||
|
// it encounters in Score.
|
||||||
|
//
|
||||||
|
// - threshold: modified Z-score cutoff after calibration. Typical: 3.5.
|
||||||
|
// - calibrationSize: number of vectors to buffer before first calibration.
|
||||||
|
// Recommended: 60–200
|
||||||
|
func NewMADDetectorAutoCalibrate(threshold float64, calibrationSize int) *MADDetector {
|
||||||
|
if calibrationSize <= 0 {
|
||||||
|
calibrationSize = 100
|
||||||
|
}
|
||||||
|
// Initialise with "Identity" stats (median=0, mad=1) so the detector is
|
||||||
|
// operational immediately with a global sensitivity of 1.0 (baseline IQR).
|
||||||
|
// Features are already RobustScaled by DuckDB, so this is a sane prior.
|
||||||
|
// Automatic calibration will refine these once the buffer is full.
|
||||||
|
return &MADDetector{
|
||||||
|
threshold: threshold,
|
||||||
|
calibrationSize: calibrationSize,
|
||||||
|
medians: nil, // will be Lazy-init or from buffer
|
||||||
|
mads: nil,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Fit recomputes per-feature median and MAD from the supplied vectors,
|
||||||
|
// replacing any prior calibration. Safe to call concurrently with Score.
|
||||||
|
func (m *MADDetector) Fit(vectors []types.FeatureVector) error {
|
||||||
|
if len(vectors) == 0 {
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
raw := make([][]float64, len(vectors))
|
||||||
|
for i, v := range vectors {
|
||||||
|
raw[i] = v.NormalizedVector
|
||||||
|
}
|
||||||
|
medians, mads := computeMADStats(raw)
|
||||||
|
|
||||||
|
m.mu.Lock()
|
||||||
|
m.medians = medians
|
||||||
|
m.mads = mads
|
||||||
|
m.calibrated = true
|
||||||
|
m.calibrationBuf = nil
|
||||||
|
m.mu.Unlock()
|
||||||
|
|
||||||
|
log.Printf("mad: fitted on %d vectors (%d features)", len(vectors), len(medians))
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
|
||||||
|
// Update is a no-op when manual statistics are used. When auto-calibration is
|
||||||
|
// active it is equivalent to calling Score but discards the result.
|
||||||
|
func (m *MADDetector) Update(v types.FeatureVector) error {
|
||||||
|
_, _ = m.Score(v)
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
|
||||||
|
// Score computes the maximum modified Z-score across all features of vector.
|
||||||
|
//
|
||||||
|
// During the auto-calibration warmup the vector is buffered and a zero-score
|
||||||
|
// result is returned. Once the calibration buffer is full the statistics are
|
||||||
|
// derived automatically and scoring starts on the next call.
|
||||||
|
//
|
||||||
|
// vector.NormalizedVector must contain values on the same scale as the
|
||||||
|
// medians and mads slices (i.e. RobustScaler-scaled values from DuckDB).
|
||||||
|
func (m *MADDetector) Score(vector types.FeatureVector) (types.AnomalyResult, error) {
|
||||||
|
m.mu.Lock()
|
||||||
|
// ── Auto-calibration warmup ───────────────────────────────────────────
|
||||||
|
if !m.calibrated && m.calibrationSize > 0 {
|
||||||
|
if vec := vector.NormalizedVector; len(vec) > 0 {
|
||||||
|
cp := make([]float64, len(vec))
|
||||||
|
copy(cp, vec)
|
||||||
|
m.calibrationBuf = append(m.calibrationBuf, cp)
|
||||||
|
}
|
||||||
|
if len(m.calibrationBuf) >= m.calibrationSize {
|
||||||
|
m.medians, m.mads = computeMADStats(m.calibrationBuf)
|
||||||
|
m.calibrated = true
|
||||||
|
m.calibrationBuf = nil
|
||||||
|
log.Printf("mad: auto-calibrated on %d vectors (%d features)",
|
||||||
|
m.calibrationSize, len(m.medians))
|
||||||
|
}
|
||||||
|
if !m.calibrated {
|
||||||
|
m.mu.Unlock()
|
||||||
|
return m.scoreIdentity(vector), nil
|
||||||
|
}
|
||||||
|
}
|
||||||
|
medians := m.medians
|
||||||
|
mads := m.mads
|
||||||
|
m.mu.Unlock()
|
||||||
|
|
||||||
|
// ── Scoring ───────────────────────────────────────────────────────────
|
||||||
|
maxScore := 0.0
|
||||||
|
for i, val := range vector.NormalizedVector {
|
||||||
|
if i >= len(medians) || i >= len(mads) {
|
||||||
|
break
|
||||||
|
}
|
||||||
|
// Stability floor: prevent explosive Z-scores for features with near-zero variance.
|
||||||
|
// 1e-2 corresponds to 1% of the original baseline IQR.
|
||||||
|
mad := math.Max(mads[i], 0.01)
|
||||||
|
|
||||||
|
// 1.4826 converts MAD to an estimator of standard deviation.
|
||||||
|
score := math.Abs(val-medians[i]) / (1.4826 * mad)
|
||||||
|
if score > maxScore {
|
||||||
|
maxScore = score
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return types.AnomalyResult{
|
||||||
|
Timestamp: vector.Timestamp,
|
||||||
|
Score: maxScore,
|
||||||
|
IsAnomaly: maxScore > m.threshold,
|
||||||
|
Confidence: math.Min(maxScore/math.Max(m.threshold, 1e-9), 1.0),
|
||||||
|
Method: "MAD",
|
||||||
|
}, nil
|
||||||
|
}
|
||||||
|
|
||||||
|
// scoreIdentity provides a sane fallback (median=0, mad=1) for pre-scaled data.
|
||||||
|
func (m *MADDetector) scoreIdentity(vector types.FeatureVector) types.AnomalyResult {
|
||||||
|
maxScore := 0.0
|
||||||
|
for _, val := range vector.NormalizedVector {
|
||||||
|
score := math.Abs(val) / 0.6745 // 1/1.4826
|
||||||
|
if score > maxScore {
|
||||||
|
maxScore = score
|
||||||
|
}
|
||||||
|
}
|
||||||
|
res := types.AnomalyResult{
|
||||||
|
Timestamp: vector.Timestamp,
|
||||||
|
Score: maxScore,
|
||||||
|
IsAnomaly: maxScore > m.threshold,
|
||||||
|
Confidence: math.Min(maxScore/math.Max(m.threshold, 1e-9), 1.0),
|
||||||
|
Method: "MAD (warmup)",
|
||||||
|
}
|
||||||
|
if res.IsAnomaly {
|
||||||
|
res.Details = "Detected during MAD auto-calibration warmup period (using identity prior)."
|
||||||
|
}
|
||||||
|
return res
|
||||||
|
}
|
||||||
|
|
||||||
|
// ── calibration helper ────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
// computeMADStats returns per-feature median and MAD for a matrix of row vectors.
|
||||||
|
// Both slices have length equal to the number of features (columns).
|
||||||
|
func computeMADStats(rows [][]float64) (medians, mads []float64) {
|
||||||
|
if len(rows) == 0 {
|
||||||
|
return nil, nil
|
||||||
|
}
|
||||||
|
nFeatures := len(rows[0])
|
||||||
|
medians = make([]float64, nFeatures)
|
||||||
|
mads = make([]float64, nFeatures)
|
||||||
|
|
||||||
|
col := make([]float64, len(rows))
|
||||||
|
devs := make([]float64, len(rows))
|
||||||
|
for f := range nFeatures {
|
||||||
|
for r, row := range rows {
|
||||||
|
if f < len(row) {
|
||||||
|
col[r] = row[f]
|
||||||
|
}
|
||||||
|
}
|
||||||
|
med := median(col)
|
||||||
|
medians[f] = med
|
||||||
|
for r, v := range col {
|
||||||
|
devs[r] = math.Abs(v - med)
|
||||||
|
}
|
||||||
|
mads[f] = median(devs)
|
||||||
|
}
|
||||||
|
return medians, mads
|
||||||
|
}
|
||||||
|
|
||||||
|
// median returns the median of xs, or 0 for an empty slice. For an even
// number of elements it is the mean of the two middle values.
// xs is sorted in place as a side effect.
func median(xs []float64) float64 {
	if len(xs) == 0 {
		return 0
	}
	sort.Float64s(xs)

	mid := len(xs) / 2
	if len(xs)&1 == 1 {
		return xs[mid]
	}
	return (xs[mid-1] + xs[mid]) / 2.0
}
|
||||||
114
internal/detect/mad_test.go
Normal file
114
internal/detect/mad_test.go
Normal file
|
|
@ -0,0 +1,114 @@
|
||||||
|
package detect
|
||||||
|
|
||||||
|
import (
|
||||||
|
"testing"
|
||||||
|
"time"
|
||||||
|
|
||||||
|
"codeberg.org/pata1704/guenther/pkg/types"
|
||||||
|
"github.com/stretchr/testify/assert"
|
||||||
|
)
|
||||||
|
|
||||||
|
// TestMADDetector_Score checks a manually calibrated detector (threshold 3.0,
// median 10, MAD 1) against one inlier and one extreme outlier.
func TestMADDetector_Score(t *testing.T) {
	detector := NewMADDetector(3.0, []float64{10.0}, []float64{1.0})

	// 1. Score a normal value: |11-10| / (1.4826*1) ≈ 0.67, below the threshold.
	res, err := detector.Score(types.FeatureVector{
		Timestamp:        time.Now(),
		NormalizedVector: []float64{11},
	})
	assert.NoError(t, err)
	assert.False(t, res.IsAnomaly, "Value 11 should not be an anomaly")

	// 2. Score an extreme outlier: |100-10| / (1.4826*1) ≈ 60.7, far above it.
	res, err = detector.Score(types.FeatureVector{
		Timestamp:        time.Now(),
		NormalizedVector: []float64{100},
	})
	assert.NoError(t, err)
	assert.True(t, res.IsAnomaly, "Value 100 should be an anomaly")
	assert.Greater(t, res.Score, 3.0)
}
|
||||||
|
|
||||||
|
// TestMADDetector_CalibrationStability verifies that the 0.01 MAD floor in
// Score keeps Z-scores bounded when calibration data has zero or near-zero
// variance.
func TestMADDetector_CalibrationStability(t *testing.T) {
	// 1. Create a detector that auto-calibrates on 100 idle vectors.
	detector := NewMADDetectorAutoCalibrate(3.5, 100)
	now := time.Now()

	// 2. Feed 99 perfectly idle vectors.
	// They are all scored by the warmup fallback; since every value is 0 the
	// score is exactly 0.
	for i := 0; i < 99; i++ {
		fv := types.FeatureVector{
			Timestamp:        now.Add(time.Duration(i) * time.Second),
			NormalizedVector: []float64{0.0, 0.0},
		}
		res, err := detector.Score(fv)
		assert.NoError(t, err)
		assert.Equal(t, 0.0, res.Score)
		assert.Contains(t, res.Method, "warmup")
	}

	// 3. Feed the 100th vector. This triggers calibration, and the vector is
	// scored against the freshly learned statistics in the same call.
	// Since all 100 vectors were 0, the learned medians and MADs are all 0.
	fv100 := types.FeatureVector{
		Timestamp:        now.Add(100 * time.Second),
		NormalizedVector: []float64{0.0, 0.0},
	}
	res100, err := detector.Score(fv100)
	assert.NoError(t, err)
	assert.Equal(t, 0.0, res100.Score)
	// The stored MADs are [0.0, 0.0]; Score raises each to 0.01 when scoring.

	// 4. Feed the 101st vector: a deviation of 1.0 on the first feature.
	// Without the floor this would divide by 1.4826 * 0 and yield +Inf.
	// With the floor (0.01), it should be 1.0 / (1.4826 * 0.01) ≈ 67.45.
	fv101 := types.FeatureVector{
		Timestamp:        now.Add(101 * time.Second),
		NormalizedVector: []float64{1.0, 0.0},
	}
	res101, err := detector.Score(fv101)
	assert.NoError(t, err)

	// Check that the score is contained.
	// 1.0 / (1.4826 * 0.01) = 67.449
	assert.InDelta(t, 67.449, res101.Score, 0.1)
	assert.True(t, res101.IsAnomaly)
	assert.Equal(t, "MAD", res101.Method) // No longer "warmup"

	// 5. Test with a very small variance but not 0.
	// With a learned MAD of 0.0001 the unfloored score for val=1.0 would be
	// 1.0 / (1.4826 * 0.0001) ≈ 6745.
	// The floor (0.01) must still clamp this to 67.45.
	detector.mu.Lock()
	detector.mads = []float64{0.0001, 0.0}
	detector.medians = []float64{0.0, 0.0}
	detector.mu.Unlock()

	resSmall, err := detector.Score(fv101)
	assert.NoError(t, err)
	assert.InDelta(t, 67.449, resSmall.Score, 0.1)
}
|
||||||
|
|
||||||
|
// TestMADDetector_IdentityPrior pins the warmup fallback scoring of an
// auto-calibrating detector that has not yet collected its 10 samples.
func TestMADDetector_IdentityPrior(t *testing.T) {
	detector := NewMADDetectorAutoCalibrate(3.5, 10)

	// Feature vector with a deviation of 2.0.
	// scoreIdentity divides by 0.6745 directly (not by 1.4826), so the
	// expected score is:
	// score = |2.0| / 0.6745 ≈ 2.965
	fv := types.FeatureVector{
		NormalizedVector: []float64{2.0},
	}
	res, _ := detector.Score(fv)
	assert.InDelta(t, 2.965, res.Score, 0.1)
	assert.False(t, res.IsAnomaly) // 2.96 < 3.5

	// Feature vector with deviation of 3.0.
	// score = 3.0 / 0.6745 ≈ 4.44, above the 3.5 threshold.
	fv2 := types.FeatureVector{
		NormalizedVector: []float64{3.0},
	}
	res2, _ := detector.Score(fv2)
	assert.InDelta(t, 4.44, res2.Score, 0.1)
	assert.True(t, res2.IsAnomaly)
	assert.Contains(t, res2.Details, "identity prior")
}
|
||||||
173
internal/detect/rrcf.go
Normal file
173
internal/detect/rrcf.go
Normal file
|
|
@ -0,0 +1,173 @@
|
||||||
|
// Package detect provides anomaly detection algorithms and ensemble logic.
|
||||||
|
package detect
|
||||||
|
|
||||||
|
import (
|
||||||
|
"fmt"
|
||||||
|
"log"
|
||||||
|
"math"
|
||||||
|
"sync"
|
||||||
|
|
||||||
|
"codeberg.org/pata1704/guenther/pkg/types"
|
||||||
|
"codeberg.org/pata1704/rrcf"
|
||||||
|
)
|
||||||
|
|
||||||
|
// RRCFDetector wraps pkg/rrcf.Forest with the AnomalyDetector interface.
//
// Scoring strategy: score-then-insert (online streaming).
// Each call to Score:
//  1. Scores the point without inserting (ephemeral key – thread-safe).
//  2. Inserts the point permanently so the forest stays fresh.
//
// All fields are guarded by mu; Fit and Score take the lock for their whole
// duration.
type RRCFDetector struct {
	mu     sync.Mutex
	forest *rrcf.Forest // created by Fit, or lazily on the first Score call

	thresholdPct float64               // quantile of the rolling score window used as threshold
	numTrees     int                   // passed to rrcf.NewForest
	treeSize     int                   // passed to rrcf.NewForest
	warmup       int                   // number of vectors Score buffers before scoring (0 disables)
	counter      int                   // key passed to forest.Insert for the next point
	buf          []types.FeatureVector // vectors collected during warmup

	// Rolling score window for adaptive threshold calculation.
	// Uses a FIFO ring buffer; only scores after warmupDiscard are included.
	scoreWindow   *ringBuffer
	warmupDiscard int // number of scores to discard after forest initialisation
	scored        int // total scores seen (including discarded)
}
|
||||||
|
|
||||||
|
// NewRRCFDetector creates an RRCFDetector.
|
||||||
|
//
|
||||||
|
// - numTrees: number of trees in the forest (200 recommended).
|
||||||
|
// - treeSize: sliding-window capacity per tree (256 recommended).
|
||||||
|
// - warmup: vectors to buffer before first Score (pass 0 for immediate start).
|
||||||
|
// - thresholdPct: percentile of rolling score window used as threshold.
|
||||||
|
// E.g. 0.65 means: flag as anomaly if score > 65th percentile of recent scores.
|
||||||
|
//
|
||||||
|
// Internal defaults:
|
||||||
|
// - warmupDiscard = 10 (discard the first 10 scores; forest is not yet stable)
|
||||||
|
// - scoreWindowMax = 60
|
||||||
|
func NewRRCFDetector(numTrees, treeSize, warmup int, thresholdPct float64) *RRCFDetector {
|
||||||
|
return &RRCFDetector{
|
||||||
|
numTrees: numTrees,
|
||||||
|
treeSize: treeSize,
|
||||||
|
warmup: warmup,
|
||||||
|
thresholdPct: thresholdPct,
|
||||||
|
scoreWindow: newRingBuffer(60),
|
||||||
|
warmupDiscard: 10,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Fit seeds the forest from a slice of FeatureVectors.
|
||||||
|
// It replaces any existing forest; the internal insert counter is reset.
|
||||||
|
func (d *RRCFDetector) Fit(vectors []types.FeatureVector) error {
|
||||||
|
if len(vectors) == 0 {
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
dim := len(vectors[0].NormalizedVector)
|
||||||
|
|
||||||
|
d.mu.Lock()
|
||||||
|
defer d.mu.Unlock()
|
||||||
|
|
||||||
|
d.forest = rrcf.NewForest(d.numTrees, dim, d.treeSize)
|
||||||
|
d.counter = 0
|
||||||
|
for _, v := range vectors {
|
||||||
|
if err := d.forest.Insert(v.NormalizedVector, d.counter); err != nil {
|
||||||
|
log.Printf("rrcf: fit insert: %v", err)
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
d.counter++
|
||||||
|
}
|
||||||
|
log.Printf("rrcf: forest seeded with %d points (trees=%d, treeSize=%d)",
|
||||||
|
len(vectors), d.numTrees, d.treeSize)
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
|
||||||
|
// Score returns an AnomalyResult for vector.
|
||||||
|
// During the warmup phase (len(buf) < warmup) the vector is buffered and a
|
||||||
|
// zero-score result is returned.
|
||||||
|
func (d *RRCFDetector) Score(vector types.FeatureVector) (types.AnomalyResult, error) {
|
||||||
|
d.mu.Lock()
|
||||||
|
defer d.mu.Unlock()
|
||||||
|
|
||||||
|
// Lazy forest initialisation on the first Score call.
|
||||||
|
if d.forest == nil {
|
||||||
|
dim := len(vector.NormalizedVector)
|
||||||
|
d.forest = rrcf.NewForest(d.numTrees, dim, d.treeSize)
|
||||||
|
}
|
||||||
|
|
||||||
|
// Warmup buffering.
|
||||||
|
if d.warmup > 0 && len(d.buf) < d.warmup {
|
||||||
|
d.buf = append(d.buf, vector)
|
||||||
|
if len(d.buf) == d.warmup {
|
||||||
|
for _, v := range d.buf {
|
||||||
|
_ = d.forest.Insert(v.NormalizedVector, d.counter)
|
||||||
|
d.counter++
|
||||||
|
}
|
||||||
|
d.buf = nil
|
||||||
|
log.Printf("rrcf: warmup complete (%d vectors)", d.warmup)
|
||||||
|
}
|
||||||
|
return types.AnomalyResult{
|
||||||
|
Timestamp: vector.Timestamp,
|
||||||
|
Score: 0,
|
||||||
|
IsAnomaly: false,
|
||||||
|
Method: "RRCF",
|
||||||
|
}, nil
|
||||||
|
}
|
||||||
|
|
||||||
|
// Score via ephemeral insertion.
|
||||||
|
score, err := d.forest.Score(vector.NormalizedVector)
|
||||||
|
if err != nil {
|
||||||
|
return types.AnomalyResult{}, fmt.Errorf("rrcf: %w", err)
|
||||||
|
}
|
||||||
|
|
||||||
|
// Permanent streaming insert to keep the forest fresh.
|
||||||
|
if err := d.forest.Insert(vector.NormalizedVector, d.counter); err != nil {
|
||||||
|
log.Printf("rrcf: insert: %v", err)
|
||||||
|
}
|
||||||
|
d.counter++
|
||||||
|
d.scored++
|
||||||
|
|
||||||
|
// Discard the first warmupDiscard scores: the forest is still settling
|
||||||
|
// and scores are artificially high, which would anchor the threshold.
|
||||||
|
if d.scored <= d.warmupDiscard {
|
||||||
|
return types.AnomalyResult{
|
||||||
|
Timestamp: vector.Timestamp,
|
||||||
|
Score: score,
|
||||||
|
IsAnomaly: false,
|
||||||
|
Method: "RRCF",
|
||||||
|
}, nil
|
||||||
|
}
|
||||||
|
|
||||||
|
// Update rolling score window (ring buffer).
|
||||||
|
d.scoreWindow.push(score)
|
||||||
|
|
||||||
|
// Need at least 10 scores before making decisions.
|
||||||
|
isAnomaly := false
|
||||||
|
var threshold float64
|
||||||
|
if d.scoreWindow.size >= 10 {
|
||||||
|
threshold = d.rollingThreshold()
|
||||||
|
isAnomaly = score > threshold
|
||||||
|
}
|
||||||
|
|
||||||
|
confidence := 0.0
|
||||||
|
if threshold > 1e-9 {
|
||||||
|
confidence = math.Min(score/threshold, 1.0)
|
||||||
|
}
|
||||||
|
|
||||||
|
return types.AnomalyResult{
|
||||||
|
Timestamp: vector.Timestamp,
|
||||||
|
Score: score,
|
||||||
|
IsAnomaly: isAnomaly,
|
||||||
|
Confidence: confidence,
|
||||||
|
Method: "RRCF",
|
||||||
|
}, nil
|
||||||
|
}
|
||||||
|
|
||||||
|
// rollingThreshold returns the thresholdPct-quantile of the rolling score
// window, as computed by ringBuffer.quantileVal. It does not modify the window.
// Caller must hold d.mu.
func (d *RRCFDetector) rollingThreshold() float64 {
	return d.scoreWindow.quantileVal(d.thresholdPct)
}
|
||||||
|
|
||||||
|
// Update is a no-op for RRCF: insertion happens inside Score. It ignores its
// argument and always returns nil.
//
// NOTE(review): because of this, a caller that only invokes Update (and never
// Score) does not advance the forest — confirm that is acceptable for callers
// that rely on Update to keep inactive detectors current.
func (d *RRCFDetector) Update(_ types.FeatureVector) error { return nil }
|
||||||
299
internal/detect/scaling.go
Normal file
299
internal/detect/scaling.go
Normal file
|
|
@ -0,0 +1,299 @@
|
||||||
|
package detect
|
||||||
|
|
||||||
|
import (
|
||||||
|
"log"
|
||||||
|
"sync"
|
||||||
|
"time"
|
||||||
|
|
||||||
|
"codeberg.org/pata1704/guenther/pkg/types"
|
||||||
|
)
|
||||||
|
|
||||||
|
// ScalingLevel represents the current detector complexity level.
// Higher values select cheaper detectors.
type ScalingLevel int

// Scaling levels in order of increasing load (and decreasing detector cost).
// LevelNormal is the zero value.
const (
	LevelNormal   ScalingLevel = iota // SEAD Ensemble (full accuracy)
	LevelHigh                         // COPOD (reduced complexity)
	LevelCritical                     // MAD (minimal overhead)
)
|
||||||
|
|
||||||
|
// levelName maps ScalingLevel to a human-readable string for logging.
// A level that is not listed here yields the empty string on lookup.
var levelName = map[ScalingLevel]string{
	LevelNormal:   "SEAD Ensemble (Normal)",
	LevelHigh:     "COPOD (High Load)",
	LevelCritical: "MAD (Critical Load)",
}
|
||||||
|
|
||||||
|
// ── SwitchableDetector ───────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
// SwitchableDetector wraps a SEADDetector and allows runtime switching to
// lighter-weight sub-detectors (COPOD, MAD) under high CPU load.
//
// State consistency goal: base detectors should be kept up-to-date
// regardless of which one is currently active, so that a transition back to
// SEAD does not start from stale internal state.
//
// Update-deduplication contract (as intended by the design):
//
//	SEAD.Score()  calls d.Score() on every base detector, which self-updates.
//	              → no separate Update() call needed; doing so would double-count.
//	SEAD.Update() calls d.Update() on every base detector directly.
//	              → used here when we need to advance inactive detectors
//	                without scoring through SEAD.
//
// For LevelHigh / LevelCritical, Score calls:
//
//	s.ensemble.Update(vector)   → forwards to each base detector's Update()
//	active.Score(vector)        → scores with the active sub-detector
//
// According to the original design notes, COPOD therefore receives one
// Update() plus one self-update from Score() per tick, and this is
// intentional: Update() appends to the sliding window buffer; Score() computes
// the copula and then appends the scored point (score-then-insert).
// NOTE(review): COPOD is not visible from this file section — verify.
//
// Caveats visible in this package:
//
//   - RRCFDetector.Update is a no-op, so RRCF forests are NOT advanced while
//     the level is High or Critical; they resume from their last state when
//     the level returns to Normal.
//   - MADDetector.Update calls MADDetector.Score. At LevelCritical the MAD
//     detector therefore sees each vector twice per tick (once via
//     ensemble.Update, once via the direct Score). This only matters while MAD
//     is still auto-calibrating, where each call buffers the vector.
//     NOTE(review): assumes SEAD.Update forwards to MAD.Update — confirm.
type SwitchableDetector struct {
	mu sync.RWMutex // guards activeLevel

	ensemble *SEADDetector
	copod    AnomalyDetector // may be nil if COPOD is not configured
	mad      AnomalyDetector // may be nil if MAD is not configured

	activeLevel ScalingLevel
}
|
||||||
|
|
||||||
|
// NewSwitchableDetector creates a SwitchableDetector backed by the given
|
||||||
|
// SEADDetector. COPOD and MAD sub-detectors are extracted from the ensemble
|
||||||
|
// for direct access during high-load switching.
|
||||||
|
//
|
||||||
|
// If a sub-detector is not present in the ensemble, the corresponding field
|
||||||
|
// is nil and Score() falls back to the ensemble for that level.
|
||||||
|
func NewSwitchableDetector(ensemble *SEADDetector) *SwitchableDetector {
|
||||||
|
return &SwitchableDetector{
|
||||||
|
ensemble: ensemble,
|
||||||
|
copod: ensemble.GetDetector("COPOD"),
|
||||||
|
mad: ensemble.GetDetector("MAD"),
|
||||||
|
activeLevel: LevelNormal,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Fit trains the underlying detectors on the given baseline vectors by
// delegating to the ensemble's Fit and returning its error unchanged.
// The active scaling level is not consulted or changed.
func (s *SwitchableDetector) Fit(vectors []types.FeatureVector) error {
	return s.ensemble.Fit(vectors)
}
|
||||||
|
|
||||||
|
// Update advances the internal state of the base detectors without scoring
// by delegating to the ensemble's Update and returning its error unchanged.
// It takes no lock of its own.
// NOTE(review): concurrency safety therefore depends on SEADDetector.Update,
// which is not visible here — confirm.
func (s *SwitchableDetector) Update(vector types.FeatureVector) error {
	return s.ensemble.Update(vector)
}
|
||||||
|
|
||||||
|
// Score returns an AnomalyResult from the currently active detector.
|
||||||
|
//
|
||||||
|
// All inactive detectors are kept current via SEAD.Update() so that
|
||||||
|
// switching back to a heavier detector does not produce stale scores.
|
||||||
|
// Safe for concurrent use.
|
||||||
|
func (s *SwitchableDetector) Score(vector types.FeatureVector) (types.AnomalyResult, error) {
|
||||||
|
s.mu.RLock()
|
||||||
|
level := s.activeLevel
|
||||||
|
s.mu.RUnlock()
|
||||||
|
|
||||||
|
// LevelNormal: SEAD.Score() handles everything internally.
|
||||||
|
// It scores all base detectors (which self-update) and applies
|
||||||
|
// MWU weight adaptation. No separate Update() needed.
|
||||||
|
if level == LevelNormal {
|
||||||
|
return s.ensemble.Score(vector)
|
||||||
|
}
|
||||||
|
|
||||||
|
// LevelHigh / LevelCritical:
|
||||||
|
// 1. Advance all base detectors via SEAD.Update() so inactive detectors
|
||||||
|
// (MAD, RRCF variants for LevelHigh; RRCF, COPOD for LevelCritical)
|
||||||
|
// maintain current state. SEAD weight adaptation is NOT performed here
|
||||||
|
// because we are bypassing SEAD.Score().
|
||||||
|
if err := s.ensemble.Update(vector); err != nil {
|
||||||
|
// Non-fatal: log and continue. A single missed update is acceptable;
|
||||||
|
// the detector will resync on the next tick.
|
||||||
|
log.Printf("scaling: ensemble update error at level %s: %v", levelName[level], err)
|
||||||
|
}
|
||||||
|
|
||||||
|
// 2. Score via the active sub-detector.
|
||||||
|
// COPOD.Score() additionally self-updates (score-then-insert), which is
|
||||||
|
// correct and complementary to the Update() call above (see type doc).
|
||||||
|
// MAD.Update() internally calls Score(), so it is already current after
|
||||||
|
// the SEAD.Update() call; MAD.Score() here is pure scoring only.
|
||||||
|
switch level {
|
||||||
|
case LevelHigh:
|
||||||
|
if s.copod == nil {
|
||||||
|
log.Printf("scaling: COPOD unavailable at LevelHigh, falling back to ensemble")
|
||||||
|
return s.ensemble.Score(vector)
|
||||||
|
}
|
||||||
|
res, err := s.copod.Score(vector)
|
||||||
|
if err != nil {
|
||||||
|
return res, err
|
||||||
|
}
|
||||||
|
res.Method = "COPOD (High Load)"
|
||||||
|
return res, nil
|
||||||
|
|
||||||
|
case LevelCritical:
|
||||||
|
if s.mad == nil {
|
||||||
|
log.Printf("scaling: MAD unavailable at LevelCritical, falling back to ensemble")
|
||||||
|
return s.ensemble.Score(vector)
|
||||||
|
}
|
||||||
|
res, err := s.mad.Score(vector)
|
||||||
|
if err != nil {
|
||||||
|
return res, err
|
||||||
|
}
|
||||||
|
res.Method = "MAD (Critical Load)"
|
||||||
|
return res, nil
|
||||||
|
|
||||||
|
default:
|
||||||
|
return s.ensemble.Score(vector)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Switch atomically changes the active detection level.
|
||||||
|
// It is a no-op if the requested level equals the current level.
|
||||||
|
// Safe for concurrent use.
|
||||||
|
func (s *SwitchableDetector) Switch(level ScalingLevel) {
|
||||||
|
s.mu.Lock()
|
||||||
|
defer s.mu.Unlock()
|
||||||
|
|
||||||
|
if s.activeLevel == level {
|
||||||
|
return
|
||||||
|
}
|
||||||
|
log.Printf("[SCALING] %s → %s", levelName[s.activeLevel], levelName[level])
|
||||||
|
s.activeLevel = level
|
||||||
|
}
|
||||||
|
|
||||||
|
// ── ScalingController ────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
// ScalingController monitors CPU load and drives a SwitchableDetector through
// its scaling levels (Normal → High → Critical and back).
//
// Level transitions follow a two-phase commit pattern:
//
//  1. A CPU measurement moves the desired level to a "pending" state.
//  2. Only after the pending level has been stable for the configured
//     duration is Switch() called on the detector.
//
// This prevents rapid oscillation under bursty workloads.
//
// Hysteresis rules (in the dead-band between downThres and highThres):
//
//	Critical → High   (one step down, not straight to Normal)
//	High     → High   (stays until CPU drops below downThres)
//	Normal   → Normal
//
// The stable duration is chosen by the target level: critDur for Critical,
// highDur for High (including the step down from Critical to High) and
// downDur for Normal.
//
// ScalingController is not safe for concurrent use. ObserveCPU must be
// called from a single goroutine (the DetectionLayer's processing loop).
type ScalingController struct {
	detector *SwitchableDetector

	// Thresholds (CPU percent, 0–100)
	highThres float64
	critThres float64
	downThres float64

	// Required stable duration before a level transition is committed.
	highDur time.Duration
	critDur time.Duration
	downDur time.Duration

	// currentLevel is the level that has been committed to the detector.
	currentLevel ScalingLevel

	// pendingLevel is the desired level based on recent CPU measurements.
	// It must remain stable for the corresponding duration before becoming current.
	pendingLevel ScalingLevel

	// pendingStart is the time at which pendingLevel last changed, or at which
	// the last commit check passed. The pending level is committed once the
	// time elapsed since pendingStart reaches the required duration.
	pendingStart time.Time
}
|
||||||
|
|
||||||
|
// NewScalingController constructs a ScalingController.
|
||||||
|
// Duration arguments are in seconds (float64 to match YAML config values).
|
||||||
|
func NewScalingController(
|
||||||
|
detector *SwitchableDetector,
|
||||||
|
highThres, critThres, downThres float64,
|
||||||
|
highDurSec, critDurSec, downDurSec float64,
|
||||||
|
) *ScalingController {
|
||||||
|
return &ScalingController{
|
||||||
|
detector: detector,
|
||||||
|
highThres: highThres,
|
||||||
|
critThres: critThres,
|
||||||
|
downThres: downThres,
|
||||||
|
highDur: time.Duration(highDurSec * float64(time.Second)),
|
||||||
|
critDur: time.Duration(critDurSec * float64(time.Second)),
|
||||||
|
downDur: time.Duration(downDurSec * float64(time.Second)),
|
||||||
|
currentLevel: LevelNormal,
|
||||||
|
pendingLevel: LevelNormal,
|
||||||
|
pendingStart: time.Now(), // explicit init avoids zero-time edge case
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// ObserveCPU processes a single CPU measurement and, if warranted, triggers
|
||||||
|
// a level switch on the underlying SwitchableDetector.
|
||||||
|
//
|
||||||
|
// Must be called from a single goroutine only (not safe for concurrent use).
|
||||||
|
func (c *ScalingController) ObserveCPU(cpuPercent float64) {
|
||||||
|
now := time.Now()
|
||||||
|
|
||||||
|
desired := c.desiredLevel(cpuPercent)
|
||||||
|
|
||||||
|
// Phase 1: desired level changed → restart the stability timer.
|
||||||
|
if desired != c.pendingLevel {
|
||||||
|
c.pendingLevel = desired
|
||||||
|
c.pendingStart = now
|
||||||
|
return
|
||||||
|
}
|
||||||
|
|
||||||
|
// Phase 2: desired level has been stable – check if duration is met.
|
||||||
|
if now.Sub(c.pendingStart) < c.durationFor(desired) {
|
||||||
|
return
|
||||||
|
}
|
||||||
|
|
||||||
|
if desired != c.currentLevel {
|
||||||
|
c.currentLevel = desired
|
||||||
|
c.detector.Switch(desired)
|
||||||
|
}
|
||||||
|
c.pendingStart = now
|
||||||
|
}
|
||||||
|
|
||||||
|
// desiredLevel computes the target ScalingLevel for a given CPU measurement,
|
||||||
|
// applying hysteresis in the dead-band between downThres and highThres.
|
||||||
|
func (c *ScalingController) desiredLevel(cpuPercent float64) ScalingLevel {
|
||||||
|
switch {
|
||||||
|
case cpuPercent > c.critThres:
|
||||||
|
return LevelCritical
|
||||||
|
case cpuPercent > c.highThres:
|
||||||
|
return LevelHigh
|
||||||
|
case cpuPercent < c.downThres:
|
||||||
|
return LevelNormal
|
||||||
|
default:
|
||||||
|
// Dead-band: degrade at most one step to avoid jumping straight
|
||||||
|
// from Critical to Normal on a brief CPU dip.
|
||||||
|
switch c.currentLevel {
|
||||||
|
case LevelCritical:
|
||||||
|
return LevelHigh
|
||||||
|
case LevelHigh:
|
||||||
|
return LevelHigh
|
||||||
|
default:
|
||||||
|
return LevelNormal
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// durationFor returns the required stable duration for a given target level.
|
||||||
|
func (c *ScalingController) durationFor(level ScalingLevel) time.Duration {
|
||||||
|
switch level {
|
||||||
|
case LevelCritical:
|
||||||
|
return c.critDur
|
||||||
|
case LevelHigh:
|
||||||
|
return c.highDur
|
||||||
|
default:
|
||||||
|
return c.downDur
|
||||||
|
}
|
||||||
|
}
|
||||||
507
internal/detect/sead.go
Normal file
507
internal/detect/sead.go
Normal file
|
|
@ -0,0 +1,507 @@
|
||||||
|
// Package detect provides anomaly detection algorithms and ensemble logic.
|
||||||
|
package detect
|
||||||
|
|
||||||
|
// sead.go – SEAD: Unsupervised Ensemble of Streaming Anomaly Detectors
|
||||||
|
//
|
||||||
|
// Implementation of Algorithm 1 from:
|
||||||
|
// Shah et al. "SEAD: Unsupervised Ensemble of Streaming Anomaly Detectors"
|
||||||
|
// ICML 2025, Amazon Science.
|
||||||
|
//
|
||||||
|
// Core algorithm (Multiplicative Weights Update / FTRL with KL-divergence):
|
||||||
|
//
|
||||||
|
// 1. For each incoming feature vector x_t:
|
||||||
|
// a. Score every base detector: s̃_i(t) = A_i(x_t)
|
||||||
|
// b. Normalise to [0,1] via streaming quantile: s_i(t) = Q(s̃_i(t); history_i)
|
||||||
|
// c. Compute softmax weights: p_i(t) = exp(w_i) / Σ exp(w_j)
|
||||||
|
// d. Output combined score: S_t = Σ p_i(t) · s_i(t)
|
||||||
|
// e. Update weights: w_i(t+1) = w_i(t) − η · ∂L_t/∂w_i
|
||||||
|
// where L_t = S_t + λ · KL(p || π)
|
||||||
|
// 2. Update each base detector: A_i(t+1) ← Update(A_i(t), x_t)
|
||||||
|
//
|
||||||
|
// Streaming quantiles are approximated via a fixed-capacity sorted circular
|
||||||
|
// buffer (lightweight t-digest substitute). For N=4 detectors at 1 Hz this
|
||||||
|
// is negligible memory and CPU overhead.
|
||||||
|
//
|
||||||
|
// SEAD runs parallel to the existing AVG/MAX/MEDIAN ensemble; it is selected
|
||||||
|
// by setting detector.ensemble.method = "sead" in the config.
|
||||||
|
|
||||||
|
import (
|
||||||
|
"fmt"
|
||||||
|
"math"
|
||||||
|
"sort"
|
||||||
|
"strings"
|
||||||
|
"sync"
|
||||||
|
|
||||||
|
"codeberg.org/pata1704/guenther/pkg/types"
|
||||||
|
)
|
||||||
|
|
||||||
|
// ─── FIFO Ring Buffer ─────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
// ringBuffer is a fixed-capacity circular buffer with true FIFO eviction:
// once full, each new value overwrites the oldest one.
//
// Memory: O(cap · 8 bytes). For cap=500 this is 4 KB per detector.
//
// It carries no lock of its own.
type ringBuffer struct {
	data []float64 // backing storage, len(data) == cap
	head int       // index of the next write position
	size int       // current number of elements (never exceeds cap)
	cap  int       // fixed capacity
}
|
||||||
|
|
||||||
|
func newRingBuffer(capacity int) *ringBuffer {
|
||||||
|
if capacity < 10 {
|
||||||
|
capacity = 10
|
||||||
|
}
|
||||||
|
return &ringBuffer{
|
||||||
|
data: make([]float64, capacity),
|
||||||
|
cap: capacity,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// push inserts v, overwriting the oldest entry when the buffer is full.
|
||||||
|
// Returns the empirical quantile rank of v within the current window ∈ [0,1].
|
||||||
|
func (r *ringBuffer) push(v float64) float64 {
|
||||||
|
r.data[r.head] = v
|
||||||
|
r.head = (r.head + 1) % r.cap
|
||||||
|
if r.size < r.cap {
|
||||||
|
r.size++
|
||||||
|
}
|
||||||
|
|
||||||
|
n := r.size
|
||||||
|
if n <= 1 {
|
||||||
|
return 0.5
|
||||||
|
}
|
||||||
|
|
||||||
|
sorted := make([]float64, n)
|
||||||
|
for i := range n {
|
||||||
|
sorted[i] = r.data[(r.head-n+i+r.cap)%r.cap]
|
||||||
|
}
|
||||||
|
sort.Float64s(sorted)
|
||||||
|
|
||||||
|
rank := sort.SearchFloat64s(sorted, v)
|
||||||
|
return float64(rank) / float64(n-1)
|
||||||
|
}
|
||||||
|
|
||||||
|
// quantileVal returns the value at quantile p ∈ [0,1] without modifying the buffer.
|
||||||
|
func (r *ringBuffer) quantileVal(p float64) float64 {
|
||||||
|
n := r.size
|
||||||
|
if n == 0 {
|
||||||
|
return 0
|
||||||
|
}
|
||||||
|
sorted := make([]float64, n)
|
||||||
|
for i := range n {
|
||||||
|
sorted[i] = r.data[(r.head-n+i+r.cap)%r.cap]
|
||||||
|
}
|
||||||
|
sort.Float64s(sorted)
|
||||||
|
idx := int(p * float64(n-1))
|
||||||
|
if idx >= n {
|
||||||
|
idx = n - 1
|
||||||
|
}
|
||||||
|
return sorted[idx]
|
||||||
|
}
|
||||||
|
|
||||||
|
// streamQuantile is an alias kept for API compatibility.
|
||||||
|
// New code should use ringBuffer directly.
|
||||||
|
type streamQuantile = ringBuffer
|
||||||
|
|
||||||
|
func newStreamQuantile(capacity int) *ringBuffer {
|
||||||
|
return newRingBuffer(capacity)
|
||||||
|
}
|
||||||
|
|
||||||
|
// ─── SEADDetector ─────────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
// SEADDetector implements the SEAD algorithm: an unsupervised online ensemble
|
||||||
|
// that adaptively weights N base anomaly detectors using Multiplicative Weights
|
||||||
|
// Update (MWU / FTRL with KL-divergence regulariser).
|
||||||
|
//
|
||||||
|
// Key properties:
|
||||||
|
// - Fully unsupervised: no anomaly labels required.
|
||||||
|
// - O(1) per time step: computational cost does not grow with stream length.
|
||||||
|
// - Adaptive: detector weights shift as data distribution changes.
|
||||||
|
// - Score-scale agnostic: all base scores are quantile-normalised to [0,1]
|
||||||
|
// before aggregation, preventing any single detector from dominating due
|
||||||
|
// to score magnitude differences.
|
||||||
|
//
|
||||||
|
// Configuration:
|
||||||
|
// - eta (η): MWU learning rate. Larger → faster adaptation, more noise.
|
||||||
|
// Recommended range: [0.05, 0.3]. Default: 0.1.
|
||||||
|
// - lambda (λ): KL-divergence regularisation strength. 0 = pure MWU (uniform
|
||||||
|
// prior). Positive values pull weights toward π (uniform). Default: 0.01.
|
||||||
|
// - quantileWindow: number of past scores retained per detector for quantile
|
||||||
|
// normalisation. Default: 300.
|
||||||
|
// - contamination: expected anomaly fraction used to set the decision
|
||||||
|
// threshold as quantile(combinedHistory, 1-contamination). Default: 0.15.
|
||||||
|
// - minDataPoints: minimum scored windows before any anomaly is flagged.
|
||||||
|
type SEADDetector struct {
|
||||||
|
detectors []AnomalyDetector // N base detectors (MAD, RRCF, COPOD, IForest)
|
||||||
|
names []string // human-readable name per detector
|
||||||
|
|
||||||
|
// MWU state
|
||||||
|
weights []float64 // w_i (log-space, unconstrained)
|
||||||
|
eta float64 // learning rate η
|
||||||
|
lambda float64 // KL regularisation strength λ
|
||||||
|
prior []float64 // π – uniform by default
|
||||||
|
|
||||||
|
// Streaming quantile per detector
|
||||||
|
quantiles []*streamQuantile
|
||||||
|
|
||||||
|
// Combined score history for threshold computation
|
||||||
|
// Uses a FIFO ring buffer (capacity: historySize) so every score lives
|
||||||
|
// exactly historySize time steps, regardless of its magnitude.
|
||||||
|
contamination float64
|
||||||
|
combinedHistory *ringBuffer // FIFO ring buffer, capacity=1000
|
||||||
|
minDataPoints int
|
||||||
|
|
||||||
|
mu sync.Mutex
|
||||||
|
}
|
||||||
|
|
||||||
|
// SEADConfig bundles every tunable parameter of the SEAD ensemble.
// NewSEADDetector substitutes the documented default for any field that is
// unset or out of range (Lambda excepted, where 0 is a meaningful value).
type SEADConfig struct {
	// Eta is the MWU learning rate η. Higher values follow distribution
	// shifts more quickly at the price of noisier weights.
	// Recommended: 0.05–0.20. Default: 0.10.
	Eta float64

	// Lambda is the strength of the KL-divergence regulariser.
	// 0 means pure MWU (deviation from the prior is not penalised);
	// positive values add stability. Suggested range: 0.01–0.05.
	Lambda float64

	// QuantileWindow is how many past scores are kept per detector.
	// A larger window gives steadier quantiles but adapts more slowly.
	// Default: 300.
	QuantileWindow int

	// Contamination is the expected anomaly fraction, in (0, 0.5). The
	// decision threshold is the (1-Contamination) quantile of the combined
	// score history. Values outside that range fall back to the default.
	// Default: 0.15.
	Contamination float64

	// MinDataPoints is the cold-start guard: nothing is flagged as anomalous
	// until at least this many windows have been scored. Default: 20.
	MinDataPoints int
}
|
||||||
|
|
||||||
|
// DefaultSEADConfig returns sensible defaults for the SEAD ensemble.
|
||||||
|
func DefaultSEADConfig() SEADConfig {
|
||||||
|
return SEADConfig{
|
||||||
|
Eta: 0.10,
|
||||||
|
Lambda: 0.01,
|
||||||
|
QuantileWindow: 300,
|
||||||
|
Contamination: 0.15,
|
||||||
|
MinDataPoints: 20,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// NewSEADDetector constructs a SEAD ensemble from N base detectors.
|
||||||
|
//
|
||||||
|
// - detectors: slice of base AnomalyDetector implementations. Must be ≥ 1.
|
||||||
|
// - names: human-readable labels for each detector (used in Details field).
|
||||||
|
// - cfg: SEAD tuning parameters (use DefaultSEADConfig() for a safe start).
|
||||||
|
func NewSEADDetector(
|
||||||
|
detectors []AnomalyDetector,
|
||||||
|
names []string,
|
||||||
|
cfg SEADConfig,
|
||||||
|
) (*SEADDetector, error) {
|
||||||
|
n := len(detectors)
|
||||||
|
if n == 0 {
|
||||||
|
return nil, fmt.Errorf("sead: at least one base detector required")
|
||||||
|
}
|
||||||
|
if len(names) != n {
|
||||||
|
return nil, fmt.Errorf("sead: names length %d must match detectors length %d", len(names), n)
|
||||||
|
}
|
||||||
|
|
||||||
|
if cfg.Eta <= 0 {
|
||||||
|
cfg.Eta = 0.10
|
||||||
|
}
|
||||||
|
if cfg.QuantileWindow <= 0 {
|
||||||
|
cfg.QuantileWindow = 300
|
||||||
|
}
|
||||||
|
if cfg.Contamination <= 0 || cfg.Contamination >= 0.5 {
|
||||||
|
cfg.Contamination = 0.15
|
||||||
|
}
|
||||||
|
if cfg.MinDataPoints <= 0 {
|
||||||
|
cfg.MinDataPoints = 20
|
||||||
|
}
|
||||||
|
|
||||||
|
// Uniform prior π = 1/N for all detectors.
|
||||||
|
prior := make([]float64, n)
|
||||||
|
for i := range prior {
|
||||||
|
prior[i] = 1.0 / float64(n)
|
||||||
|
}
|
||||||
|
|
||||||
|
// Initialise weights uniformly in log-space: w_i = 0 → softmax = 1/N.
|
||||||
|
weights := make([]float64, n)
|
||||||
|
|
||||||
|
quantiles := make([]*streamQuantile, n)
|
||||||
|
for i := range quantiles {
|
||||||
|
quantiles[i] = newStreamQuantile(cfg.QuantileWindow)
|
||||||
|
}
|
||||||
|
|
||||||
|
return &SEADDetector{
|
||||||
|
detectors: detectors,
|
||||||
|
names: names,
|
||||||
|
weights: weights,
|
||||||
|
eta: cfg.Eta,
|
||||||
|
lambda: cfg.Lambda,
|
||||||
|
prior: prior,
|
||||||
|
quantiles: quantiles,
|
||||||
|
contamination: cfg.Contamination,
|
||||||
|
combinedHistory: newRingBuffer(1000),
|
||||||
|
minDataPoints: cfg.MinDataPoints,
|
||||||
|
}, nil
|
||||||
|
}
|
||||||
|
|
||||||
|
// Fit seeds all base detectors from labelled-normal vectors.
|
||||||
|
// SEAD itself has no training phase; only the base detectors are fitted.
|
||||||
|
func (s *SEADDetector) Fit(vectors []types.FeatureVector) error {
|
||||||
|
for i, d := range s.detectors {
|
||||||
|
if err := d.Fit(vectors); err != nil {
|
||||||
|
return fmt.Errorf("sead: fit detector %q: %w", s.names[i], err)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
|
||||||
|
// Update propagates the feature vector to all base detectors.
|
||||||
|
func (s *SEADDetector) Update(vector types.FeatureVector) error {
|
||||||
|
for i, d := range s.detectors {
|
||||||
|
if err := d.Update(vector); err != nil {
|
||||||
|
return fmt.Errorf("sead: update detector %q: %w", s.names[i], err)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
|
||||||
|
// Score implements Algorithm 1 from the SEAD paper.
|
||||||
|
//
|
||||||
|
// Steps:
|
||||||
|
// 1. Score each base detector → raw scores s̃_i.
|
||||||
|
// Each detector also self-updates its internal state (RRCF inserts
|
||||||
|
// the point into the forest; COPOD appends to its copula buffer;
|
||||||
|
// IForest adds to its retraining buffer; MAD buffers for calibration).
|
||||||
|
// 2. Quantile-normalise each s̃_i to ŝ_i ∈ [0,1] via streaming window.
|
||||||
|
// 3. Compute softmax weights p_i = exp(w_i) / Σ exp(w_j).
|
||||||
|
// 4. Combined score S = Σ p_i · ŝ_i.
|
||||||
|
// 5. Update weights: w_i -= η · ∂L/∂w_i
|
||||||
|
// where L = S + λ · KL(p || π).
|
||||||
|
// 6. Threshold S against rolling (1-contamination)-quantile of S history.
|
||||||
|
func (s *SEADDetector) Score(vector types.FeatureVector) (types.AnomalyResult, error) {
|
||||||
|
n := len(s.detectors)
|
||||||
|
|
||||||
|
// ── Step 1: Score all base detectors ──────────────────────────────────────
|
||||||
|
// Each detector's Score method is responsible for self-updating (RRCF inserts
|
||||||
|
// into its forest; COPOD appends to its copula buffer; etc.). We do NOT call
|
||||||
|
// d.Update separately here to avoid double-counting in detectors that already
|
||||||
|
// self-update inside Score.
|
||||||
|
rawScores := make([]float64, n)
|
||||||
|
anomalyFlags := make([]bool, n)
|
||||||
|
for i, d := range s.detectors {
|
||||||
|
res, err := d.Score(vector)
|
||||||
|
if err != nil {
|
||||||
|
// Degrade gracefully: treat failed detector as neutral (score=0.5).
|
||||||
|
rawScores[i] = 0.5
|
||||||
|
} else {
|
||||||
|
rawScores[i] = res.Score
|
||||||
|
anomalyFlags[i] = res.IsAnomaly
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
s.mu.Lock()
|
||||||
|
defer s.mu.Unlock()
|
||||||
|
|
||||||
|
// ── Step 2: Quantile-normalise scores to [0,1] ────────────────────────────
|
||||||
|
normScores := make([]float64, n)
|
||||||
|
for i, raw := range rawScores {
|
||||||
|
normScores[i] = s.quantiles[i].push(raw)
|
||||||
|
}
|
||||||
|
|
||||||
|
// ── Step 3: Softmax weights ───────────────────────────────────────────────
|
||||||
|
p := softmax(s.weights)
|
||||||
|
|
||||||
|
// ── Step 4: Combined score ────────────────────────────────────────────────
|
||||||
|
combined := 0.0
|
||||||
|
for i := range p {
|
||||||
|
combined += p[i] * normScores[i]
|
||||||
|
}
|
||||||
|
|
||||||
|
// ── Step 5: Weight update (MWU gradient step) ─────────────────────────────
|
||||||
|
// Loss L(w) = combined(w) + λ · KL(softmax(w) || π)
|
||||||
|
// ∂L/∂w_i = p_i · (ŝ_i - combined) + λ · (p_i - π_i)
|
||||||
|
//
|
||||||
|
// This is the closed-form gradient for softmax + weighted sum + KL penalty.
|
||||||
|
for i := range s.weights {
|
||||||
|
gradCombined := p[i] * (normScores[i] - combined)
|
||||||
|
gradKL := s.lambda * (p[i] - s.prior[i])
|
||||||
|
s.weights[i] -= s.eta * (gradCombined + gradKL)
|
||||||
|
}
|
||||||
|
|
||||||
|
// ── Step 6: Threshold decision ────────────────────────────────────────────
|
||||||
|
// Use FIFO ring buffer: oldest score is evicted automatically after
|
||||||
|
// 1000 time steps, giving the threshold a finite, sliding memory.
|
||||||
|
s.combinedHistory.push(combined)
|
||||||
|
threshold := s.combinedHistory.quantileVal(1.0 - s.contamination)
|
||||||
|
isAnomaly := s.combinedHistory.size > s.minDataPoints && combined > threshold
|
||||||
|
|
||||||
|
confidence := 0.0
|
||||||
|
if threshold > 1e-9 {
|
||||||
|
confidence = math.Min(combined/threshold, 1.0)
|
||||||
|
}
|
||||||
|
|
||||||
|
return types.AnomalyResult{
|
||||||
|
Timestamp: vector.Timestamp,
|
||||||
|
Score: combined,
|
||||||
|
IsAnomaly: isAnomaly,
|
||||||
|
Confidence: confidence,
|
||||||
|
Method: "SEAD",
|
||||||
|
Details: s.detailString(p, normScores, anomalyFlags),
|
||||||
|
}, nil
|
||||||
|
}
|
||||||
|
|
||||||
|
// GetDetector returns a base detector by name. Returns nil if not found.
|
||||||
|
func (s *SEADDetector) GetDetector(name string) AnomalyDetector {
|
||||||
|
s.mu.Lock()
|
||||||
|
defer s.mu.Unlock()
|
||||||
|
for i, n := range s.names {
|
||||||
|
if n == name {
|
||||||
|
return s.detectors[i]
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
|
||||||
|
// Weights returns a copy of the current softmax-normalised detector weights.
|
||||||
|
// Useful for logging and diagnostics. Thread-safe.
|
||||||
|
func (s *SEADDetector) Weights() []float64 {
|
||||||
|
s.mu.Lock()
|
||||||
|
defer s.mu.Unlock()
|
||||||
|
return softmax(s.weights)
|
||||||
|
}
|
||||||
|
|
||||||
|
// WeightSummary returns a human-readable string of detector weights.
|
||||||
|
func (s *SEADDetector) WeightSummary() string {
|
||||||
|
w := s.Weights()
|
||||||
|
var sb strings.Builder
|
||||||
|
for i, name := range s.names {
|
||||||
|
if i > 0 {
|
||||||
|
sb.WriteString(" | ")
|
||||||
|
}
|
||||||
|
sb.WriteString(fmt.Sprintf("%s=%.3f", name, w[i]))
|
||||||
|
}
|
||||||
|
return sb.String()
|
||||||
|
}
|
||||||
|
|
||||||
|
// detailString builds a diagnostic annotation for AnomalyResult.Details.
|
||||||
|
// Caller must hold s.mu.
|
||||||
|
func (s *SEADDetector) detailString(p, normScores []float64, flags []bool) string {
|
||||||
|
var parts []string
|
||||||
|
for i, name := range s.names {
|
||||||
|
flag := ""
|
||||||
|
if flags[i] {
|
||||||
|
flag = "!"
|
||||||
|
}
|
||||||
|
parts = append(parts, fmt.Sprintf("%s%s:w=%.2f,s=%.2f", name, flag, p[i], normScores[i]))
|
||||||
|
}
|
||||||
|
return strings.Join(parts, " ")
|
||||||
|
}
|
||||||
|
|
||||||
|
// ─── Math helpers ─────────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
// softmax returns exp(w_i) / Σ exp(w_j) for every element of w.
//
// The maximum of w is subtracted before exponentiating so that large weights
// cannot overflow; this leaves the result unchanged mathematically. The
// returned slice is newly allocated and has the same length as w. An empty
// (or nil) input yields an empty slice instead of panicking.
func softmax(w []float64) []float64 {
	out := make([]float64, len(w))
	if len(w) == 0 {
		return out
	}

	maxW := w[0]
	for _, v := range w[1:] {
		if v > maxW {
			maxW = v
		}
	}

	var sum float64
	for i, v := range w {
		out[i] = math.Exp(v - maxW)
		sum += out[i]
	}
	for i := range out {
		out[i] /= sum
	}
	return out
}
|
||||||
|
|
||||||
|
// ─── Factory helpers ──────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
// NewSEADWithAllDetectors constructs a SEAD ensemble from six base detectors:
|
||||||
|
// MAD, RRCF-fast, RRCF-mid, RRCF-slow, COPOD, IsolationForest.
|
||||||
|
//
|
||||||
|
// SEAD's MWU weight-update naturally up-weights the variant that consistently
|
||||||
|
// separates anomalies from normal windows, and adapts when the stream
|
||||||
|
// distribution shifts (e.g. time-of-day effects).
|
||||||
|
//
|
||||||
|
// MAD auto-calibration: the MADDetector buffers the first madCalibSize
|
||||||
|
// NormalizedVectors, derives per-feature median and MAD, and starts scoring
|
||||||
|
// once calibration is complete. Calibration requires no external tooling.
|
||||||
|
// SEAD down-weights MAD automatically during the warmup phase.
|
||||||
|
func NewSEADWithAllDetectors(
|
||||||
|
copodBufferSize int, copodThreshold float64,
|
||||||
|
rrcfVariants RRCFVariantsConfig,
|
||||||
|
madThreshold float64, madCalibSize int,
|
||||||
|
seadCfg SEADConfig,
|
||||||
|
) (*SEADDetector, error) {
|
||||||
|
if rrcfVariants.Fast.NumTrees == 0 {
|
||||||
|
rrcfVariants.Fast.NumTrees = 50
|
||||||
|
}
|
||||||
|
if rrcfVariants.Fast.TreeSize == 0 {
|
||||||
|
rrcfVariants.Fast.TreeSize = 32
|
||||||
|
}
|
||||||
|
if rrcfVariants.Fast.ThresholdPercentile == 0 {
|
||||||
|
rrcfVariants.Fast.ThresholdPercentile = 0.85
|
||||||
|
}
|
||||||
|
|
||||||
|
if rrcfVariants.Mid.NumTrees == 0 {
|
||||||
|
rrcfVariants.Mid.NumTrees = 150
|
||||||
|
}
|
||||||
|
if rrcfVariants.Mid.TreeSize == 0 {
|
||||||
|
rrcfVariants.Mid.TreeSize = 64
|
||||||
|
}
|
||||||
|
if rrcfVariants.Mid.ThresholdPercentile == 0 {
|
||||||
|
rrcfVariants.Mid.ThresholdPercentile = 0.85
|
||||||
|
}
|
||||||
|
|
||||||
|
if rrcfVariants.Slow.NumTrees == 0 {
|
||||||
|
rrcfVariants.Slow.NumTrees = 200
|
||||||
|
}
|
||||||
|
if rrcfVariants.Slow.TreeSize == 0 {
|
||||||
|
rrcfVariants.Slow.TreeSize = 128
|
||||||
|
}
|
||||||
|
if rrcfVariants.Slow.ThresholdPercentile == 0 {
|
||||||
|
rrcfVariants.Slow.ThresholdPercentile = 0.85
|
||||||
|
}
|
||||||
|
|
||||||
|
// ── Construct base detectors ──────────────────────────────────────────────
|
||||||
|
copod, err := NewCOPODDetector(copodBufferSize, copodThreshold)
|
||||||
|
if err != nil {
|
||||||
|
return nil, fmt.Errorf("sead: copod: %w", err)
|
||||||
|
}
|
||||||
|
|
||||||
|
rrcfFast := NewRRCFDetector(
|
||||||
|
rrcfVariants.Fast.NumTrees, rrcfVariants.Fast.TreeSize,
|
||||||
|
0, rrcfVariants.Fast.ThresholdPercentile,
|
||||||
|
)
|
||||||
|
rrcfMid := NewRRCFDetector(
|
||||||
|
rrcfVariants.Mid.NumTrees, rrcfVariants.Mid.TreeSize,
|
||||||
|
0, rrcfVariants.Mid.ThresholdPercentile,
|
||||||
|
)
|
||||||
|
rrcfSlow := NewRRCFDetector(
|
||||||
|
rrcfVariants.Slow.NumTrees, rrcfVariants.Slow.TreeSize,
|
||||||
|
0, rrcfVariants.Slow.ThresholdPercentile,
|
||||||
|
)
|
||||||
|
|
||||||
|
if madCalibSize <= 0 {
|
||||||
|
madCalibSize = 100
|
||||||
|
}
|
||||||
|
mad := NewMADDetectorAutoCalibrate(madThreshold, madCalibSize)
|
||||||
|
|
||||||
|
return NewSEADDetector(
|
||||||
|
[]AnomalyDetector{mad, rrcfFast, rrcfMid, rrcfSlow, copod},
|
||||||
|
[]string{"MAD", "RRCF-fast", "RRCF-mid", "RRCF-slow", "COPOD"},
|
||||||
|
seadCfg,
|
||||||
|
)
|
||||||
|
}
|
||||||
61
internal/detect/sead_test.go
Normal file
61
internal/detect/sead_test.go
Normal file
|
|
@ -0,0 +1,61 @@
|
||||||
|
package detect
|
||||||
|
|
||||||
|
import (
|
||||||
|
"testing"
|
||||||
|
|
||||||
|
"github.com/stretchr/testify/assert"
|
||||||
|
)
|
||||||
|
|
||||||
|
func TestRingBuffer_FIFO(t *testing.T) {
|
||||||
|
// 1. Initialize with capacity 10
|
||||||
|
rb := newRingBuffer(10)
|
||||||
|
assert.Equal(t, 10, rb.cap)
|
||||||
|
assert.Equal(t, 0, rb.size)
|
||||||
|
|
||||||
|
// 2. Fill it up
|
||||||
|
for i := 1; i <= 10; i++ {
|
||||||
|
rb.push(float64(i))
|
||||||
|
}
|
||||||
|
assert.Equal(t, 10, rb.size)
|
||||||
|
// head should be at 0 after 10 pushes
|
||||||
|
assert.Equal(t, 0, rb.head)
|
||||||
|
|
||||||
|
// 3. Verify quantile (sorted view)
|
||||||
|
// sorted: [1 2 3 4 5 6 7 8 9 10]
|
||||||
|
// quantile 0.5 (median) of 10 items: index int(0.5 * 9) = 4 -> value 5
|
||||||
|
assert.Equal(t, 5.0, rb.quantileVal(0.5))
|
||||||
|
|
||||||
|
// 4. Push one more to trigger FIFO eviction
|
||||||
|
// Should evict "1" (the oldest)
|
||||||
|
rb.push(11.0)
|
||||||
|
assert.Equal(t, 10, rb.size)
|
||||||
|
assert.Equal(t, 1, rb.head)
|
||||||
|
|
||||||
|
// 5. Verify the oldest (1.0) is gone and 11.0 is present
|
||||||
|
// sorted: [2 3 4 5 6 7 8 9 10 11]
|
||||||
|
// idx = int(0.4 * 9) = 3 -> value at index 3 is 5.0
|
||||||
|
assert.Equal(t, 5.0, rb.quantileVal(0.4))
|
||||||
|
// let's be precise: idx = int(p * 9)
|
||||||
|
// p=0 -> idx 0 (2.0)
|
||||||
|
// p=1 -> idx 9 (11.0)
|
||||||
|
assert.Equal(t, 2.0, rb.quantileVal(0.0))
|
||||||
|
assert.Equal(t, 11.0, rb.quantileVal(1.0))
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestRingBuffer_Rank(t *testing.T) {
|
||||||
|
rb := newRingBuffer(5)
|
||||||
|
|
||||||
|
// Rank is float64(rank) / float64(n-1)
|
||||||
|
assert.Equal(t, 0.5, rb.push(10.0)) // n=1 -> 0.5
|
||||||
|
assert.Equal(t, 1.0, rb.push(20.0)) // n=2, sorted=[10, 20], search(20)->1. 1/(2-1)=1.0
|
||||||
|
assert.Equal(t, 0.0, rb.push(5.0)) // n=3, sorted=[5, 10, 20], search(5)->0. 0/2=0.0
|
||||||
|
|
||||||
|
// n=4, sorted=[5 10 10 20], search(10) -> idx 1. 1/(4-1) = 0.333...
|
||||||
|
assert.InDelta(t, 0.3333333333333333, rb.push(10.0), 1e-9)
|
||||||
|
|
||||||
|
rb = newRingBuffer(4)
|
||||||
|
rb.push(1.0)
|
||||||
|
rb.push(3.0)
|
||||||
|
rank := rb.push(2.0) // n=3, sorted=[1, 2, 3], search(2)->idx 1. 1/(3-1)=0.5
|
||||||
|
assert.Equal(t, 0.5, rank)
|
||||||
|
}
|
||||||
32
internal/drain3/masking.go
Normal file
32
internal/drain3/masking.go
Normal file
|
|
@ -0,0 +1,32 @@
|
||||||
|
// Package drain3 provides log stripping via regex-based masking templates which
|
||||||
|
// sits in front of Drain3 template mining.
|
||||||
|
package drain3
|
||||||
|
|
||||||
|
import (
|
||||||
|
"codeberg.org/pata1704/guenther/internal/config"
|
||||||
|
)
|
||||||
|
|
||||||
|
// ApplyMasking applies all MaskingPatterns sequentially to line.
|
||||||
|
//
|
||||||
|
// For each pattern with a non-empty Name, capture group 1 of the regex is
|
||||||
|
// stored in params before the match is replaced with mp.Replace.
|
||||||
|
// Patterns without a Name only mask; they never write to params.
|
||||||
|
//
|
||||||
|
// All patterns are pre-compiled via config.Compile at startup;
|
||||||
|
// no compilation happens in this hot-path function.
|
||||||
|
func ApplyMasking(line string, patterns []config.MaskingPattern) (masked string, params map[string]string) {
|
||||||
|
params = make(map[string]string, len(patterns))
|
||||||
|
masked = line
|
||||||
|
for _, mp := range patterns {
|
||||||
|
if mp.Re == nil {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
if mp.Name != "" {
|
||||||
|
if m := mp.Re.FindStringSubmatch(masked); len(m) > 1 {
|
||||||
|
params[mp.Name] = m[1]
|
||||||
|
}
|
||||||
|
}
|
||||||
|
masked = mp.Re.ReplaceAllString(masked, mp.Replace)
|
||||||
|
}
|
||||||
|
return masked, params
|
||||||
|
}
|
||||||
111
internal/health/monitor.go
Normal file
111
internal/health/monitor.go
Normal file
|
|
@ -0,0 +1,111 @@
|
||||||
|
package health
|
||||||
|
|
||||||
|
import (
|
||||||
|
"context"
|
||||||
|
"encoding/json"
|
||||||
|
"log"
|
||||||
|
"sync"
|
||||||
|
"time"
|
||||||
|
|
||||||
|
"codeberg.org/pata1704/guenther/pkg/types"
|
||||||
|
)
|
||||||
|
|
||||||
|
// HealthMonitor collects StageHealth snapshots from pipeline stages and
|
||||||
|
// periodically prints a JSON report to the standard logger.
|
||||||
|
//
|
||||||
|
// Stages write to the channel returned by Chan(). The channel is buffered
|
||||||
|
// (capacity 100) so health updates never block the sending stage.
|
||||||
|
//
|
||||||
|
// The channel is intentionally private (accessed via Chan()) so that callers
|
||||||
|
// cannot close it from outside and cannot see the internal buffer size.
|
||||||
|
type HealthMonitor struct {
|
||||||
|
healthChan chan types.StageHealth
|
||||||
|
|
||||||
|
mu sync.Mutex
|
||||||
|
stages map[string]*types.StageHealth
|
||||||
|
|
||||||
|
wg sync.WaitGroup
|
||||||
|
}
|
||||||
|
|
||||||
|
// NewHealthMonitor allocates a HealthMonitor. Call Start to begin processing.
|
||||||
|
func NewHealthMonitor() *HealthMonitor {
|
||||||
|
return &HealthMonitor{
|
||||||
|
healthChan: make(chan types.StageHealth, 100),
|
||||||
|
stages: make(map[string]*types.StageHealth),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Chan returns the write-only channel that pipeline stages use to submit
|
||||||
|
// health updates. The channel remains open for the lifetime of the monitor.
|
||||||
|
func (m *HealthMonitor) Chan() chan<- types.StageHealth {
|
||||||
|
return m.healthChan
|
||||||
|
}
|
||||||
|
|
||||||
|
// Start begins the health collection loop and periodic reporting.
|
||||||
|
// interval controls how often the report is printed (typically 5 s).
|
||||||
|
func (m *HealthMonitor) Start(ctx context.Context, interval time.Duration) {
|
||||||
|
ticker := time.NewTicker(interval)
|
||||||
|
m.wg.Go(func() {
|
||||||
|
defer ticker.Stop()
|
||||||
|
for {
|
||||||
|
select {
|
||||||
|
case h := <-m.healthChan:
|
||||||
|
m.mu.Lock()
|
||||||
|
// Shallow copy so the map owns the value.
|
||||||
|
snap := h
|
||||||
|
m.stages[h.StageName] = &snap
|
||||||
|
m.mu.Unlock()
|
||||||
|
|
||||||
|
case <-ticker.C:
|
||||||
|
m.printReport()
|
||||||
|
|
||||||
|
case <-ctx.Done():
|
||||||
|
// Drain remaining updates before exiting.
|
||||||
|
for {
|
||||||
|
select {
|
||||||
|
case h := <-m.healthChan:
|
||||||
|
m.mu.Lock()
|
||||||
|
snap := h
|
||||||
|
m.stages[h.StageName] = &snap
|
||||||
|
m.mu.Unlock()
|
||||||
|
default:
|
||||||
|
return
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
})
|
||||||
|
}
|
||||||
|
|
||||||
|
// Wait waits for the health monitor goroutine to exit after context cancellation.
|
||||||
|
func (m *HealthMonitor) Wait() {
|
||||||
|
m.wg.Wait()
|
||||||
|
}
|
||||||
|
|
||||||
|
// Snapshot returns a point-in-time copy of all stage health records.
|
||||||
|
// Useful for tests and metrics endpoints.
|
||||||
|
func (m *HealthMonitor) Snapshot() map[string]types.StageHealth {
|
||||||
|
m.mu.Lock()
|
||||||
|
defer m.mu.Unlock()
|
||||||
|
out := make(map[string]types.StageHealth, len(m.stages))
|
||||||
|
for k, v := range m.stages {
|
||||||
|
out[k] = *v
|
||||||
|
}
|
||||||
|
return out
|
||||||
|
}
|
||||||
|
|
||||||
|
func (m *HealthMonitor) printReport() {
|
||||||
|
m.mu.Lock()
|
||||||
|
defer m.mu.Unlock()
|
||||||
|
|
||||||
|
log.Println("── Pipeline Health ──────────────────────────────")
|
||||||
|
for _, h := range m.stages {
|
||||||
|
b, err := json.Marshal(h)
|
||||||
|
if err != nil {
|
||||||
|
log.Printf("[%s] marshal error: %v", h.StageName, err)
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
log.Printf("[%s] %s", h.StageName, b)
|
||||||
|
}
|
||||||
|
log.Println("─────────────────────────────────────────────────")
|
||||||
|
}
|
||||||
1091
internal/transform/engine.go
Normal file
1091
internal/transform/engine.go
Normal file
File diff suppressed because it is too large
Load diff
106
internal/transform/engine_test.go
Normal file
106
internal/transform/engine_test.go
Normal file
|
|
@ -0,0 +1,106 @@
|
||||||
|
package transform
|
||||||
|
|
||||||
|
import (
|
||||||
|
"context"
|
||||||
|
"testing"
|
||||||
|
"time"
|
||||||
|
|
||||||
|
"codeberg.org/pata1704/guenther/internal/config"
|
||||||
|
"codeberg.org/pata1704/guenther/pkg/types"
|
||||||
|
"github.com/stretchr/testify/assert"
|
||||||
|
)
|
||||||
|
|
||||||
|
func TestTransformEngine_Fusion(t *testing.T) {
|
||||||
|
logChan := make(chan types.LogEvent, 100)
|
||||||
|
metricChan := make(chan types.MetricSnapshot, 100)
|
||||||
|
serviceStatusChan := make(chan types.ServiceStatus, 100)
|
||||||
|
featureChan := make(chan types.FeatureVector, 100)
|
||||||
|
healthChan := make(chan types.StageHealth, 10)
|
||||||
|
|
||||||
|
cfg := &config.Config{}
|
||||||
|
cfg.Transformation.WindowSize = 1 * time.Second
|
||||||
|
cfg.Transformation.DbPath = ":memory:"
|
||||||
|
|
||||||
|
engine, err := NewTransformEngine(cfg, logChan, metricChan, serviceStatusChan, featureChan, healthChan)
|
||||||
|
assert.NoError(t, err)
|
||||||
|
|
||||||
|
baseTime := time.Date(2026, 1, 1, 12, 0, 0, 0, time.Local)
|
||||||
|
|
||||||
|
// 1. Send data for first window
|
||||||
|
metricChan <- types.MetricSnapshot{
|
||||||
|
Timestamp: baseTime,
|
||||||
|
CPUPercent: 50.0,
|
||||||
|
MemoryUsedMB: 1000,
|
||||||
|
MemoryDirtyMB: 100,
|
||||||
|
NetworkInMBps: 10.0,
|
||||||
|
NetworkOutMBps: 20.0,
|
||||||
|
TCPRetransPerS: 5,
|
||||||
|
NetPacketsInPerS: 100,
|
||||||
|
NetPacketsOutPerS: 200,
|
||||||
|
}
|
||||||
|
|
||||||
|
// 2. Start engine and wait for first window
|
||||||
|
ctx, cancel := context.WithCancel(context.Background())
|
||||||
|
engine.Start(ctx)
|
||||||
|
defer func() {
|
||||||
|
cancel()
|
||||||
|
engine.Wait()
|
||||||
|
}()
|
||||||
|
|
||||||
|
select {
|
||||||
|
case fv := <-featureChan:
|
||||||
|
assert.Equal(t, 50.0, fv.AvgCPUPercent)
|
||||||
|
// Deltas are absolute value on first window because tracker starts at 0
|
||||||
|
assert.Equal(t, 10.0, fv.DeltaNetIn)
|
||||||
|
case <-time.After(2 * time.Second):
|
||||||
|
t.Fatal("Timeout waiting for first FeatureVector")
|
||||||
|
}
|
||||||
|
|
||||||
|
// 3. Send data for second window (triggers deltas)
|
||||||
|
secondTime := baseTime.Add(cfg.Transformation.WindowSize)
|
||||||
|
metricChan <- types.MetricSnapshot{
|
||||||
|
Timestamp: secondTime,
|
||||||
|
CPUPercent: 60.0,
|
||||||
|
MemoryUsedMB: 1000,
|
||||||
|
MemoryDirtyMB: 200,
|
||||||
|
NetworkInMBps: 15.0, // DeltaNetIn = 15.0 - 10.0 = 5.0
|
||||||
|
NetworkOutMBps: 20.0,
|
||||||
|
TCPRetransPerS: 10, // DeltaTCPRetrans = 10.0 - 5.0 = 5.0
|
||||||
|
NetPacketsInPerS: 150,
|
||||||
|
NetPacketsOutPerS: 200,
|
||||||
|
}
|
||||||
|
|
||||||
|
select {
|
||||||
|
case fv := <-featureChan:
|
||||||
|
// Check original logic
|
||||||
|
assert.Equal(t, 60.0, fv.AvgCPUPercent)
|
||||||
|
|
||||||
|
// Check new delta features
|
||||||
|
assert.Equal(t, 5.0, fv.DeltaNetIn)
|
||||||
|
assert.Equal(t, 5.0, fv.DeltaTCPRetrans)
|
||||||
|
|
||||||
|
// Check ratio features
|
||||||
|
// MemPressure = dirty / (used + 1) = 200/1001
|
||||||
|
expectedPressure := 200.0 / 1001.0
|
||||||
|
assert.InDelta(t, expectedPressure, fv.MemPressure, 1e-9)
|
||||||
|
// NetAsymmetry = in / (out + 1e-3) = 15/20.001
|
||||||
|
expectedAsym := 15.0 / 20.001
|
||||||
|
assert.InDelta(t, expectedAsym, fv.NetAsymmetry, 1e-9)
|
||||||
|
|
||||||
|
// Check NormalizedVector length (should be 45 base + params)
|
||||||
|
assert.GreaterOrEqual(t, len(fv.NormalizedVector), 45)
|
||||||
|
|
||||||
|
// Verify slots 39-44 (Engineered Features tail)
|
||||||
|
nv := fv.NormalizedVector
|
||||||
|
assert.Equal(t, 5.0, nv[39]) // DeltaNetIn
|
||||||
|
assert.Equal(t, 5.0, nv[40]) // DeltaTCPRetrans
|
||||||
|
// TcpRollStd and NetRollStd will have values (even if just 2 pts)
|
||||||
|
assert.Greater(t, nv[41], 0.0) // TcpRollStd (10 and 5)
|
||||||
|
assert.Equal(t, 0.0, nv[42]) // NetRollStd (20 and 20 -> std=0)
|
||||||
|
assert.InDelta(t, expectedPressure, nv[43], 1e-9) // MemPressure
|
||||||
|
assert.InDelta(t, expectedAsym, nv[44], 1e-9) // NetAsymmetry
|
||||||
|
|
||||||
|
case <-time.After(2 * time.Second):
|
||||||
|
t.Fatal("Timeout waiting for second FeatureVector")
|
||||||
|
}
|
||||||
|
}
|
||||||
230
internal/transform/schema.go
Normal file
230
internal/transform/schema.go
Normal file
|
|
@ -0,0 +1,230 @@
|
||||||
|
// Package transform contains the DuckDB-backed Tumbling Window Engine.
|
||||||
|
package transform
|
||||||
|
|
||||||
|
import (
|
||||||
|
"fmt"
|
||||||
|
"strings"
|
||||||
|
|
||||||
|
"codeberg.org/pata1704/guenther/internal/config"
|
||||||
|
)
|
||||||
|
|
||||||
|
// scalerFeatureNames is the ordered list of the 24 feature names kept in the
// scaler_params table.
//
// NOTE(review): the original comment above this variable was truncated; its
// surviving tail reads "they are derived from already-scaled inputs or are
// ratio/delta features", which presumably explains why the remaining
// features are not listed here — confirm against the full source.
var scalerFeatureNames = []string{
	// CPU (3)
	"avg_cpu",
	"max_cpu",
	"std_cpu",

	// System/Kernel (7)
	"avg_iowait",
	"std_iowait",
	"avg_softirq",
	"avg_ctx_switches",
	"avg_interrupts",
	"avg_softnet_dropped",
	"avg_softnet_squeeze",

	// Network (8)
	"avg_net_in",
	"std_net_in",
	"avg_net_out",
	"std_net_out",
	"sum_tcp_retrans",
	"sum_tcp_fast_retrans",
	"sum_tcp_timeouts",
	"avg_net_drops",

	// Disk (4)
	"avg_disk_read",
	"avg_disk_write",
	"avg_disk_io_ticks",
	"std_disk_io_ticks",

	// Log (2)
	"error_count",
	"severity_score",
}
|
||||||
|
|
||||||
|
// ScalerFeatureNames returns the ordered list of feature names stored in
|
||||||
|
// scaler_params.
|
||||||
|
func ScalerFeatureNames() []string { return scalerFeatureNames }
|
||||||
|
|
||||||
|
// BuildScalerParamsTable returns the DDL statement that creates the
// scaler_params table if it does not exist yet: one row per feature name
// (primary key) with its non-null mean and std columns.
func BuildScalerParamsTable() string {
	const ddl = `CREATE TABLE IF NOT EXISTS scaler_params (
feature_name VARCHAR PRIMARY KEY,
mean DOUBLE NOT NULL,
std DOUBLE NOT NULL
)`
	return ddl
}
|
||||||
|
|
||||||
|
// BuildFitScalerQuery returns the SQL statement that (re)fits the scaler and
// upserts one row per feature into scaler_params.
//
// The statement takes two positional parameters: $1 is the inclusive lower and
// $2 the exclusive upper bound on raw_metrics.timestamp.
//
// What the query stores:
//   - "mean" is the median (PERCENTILE_CONT(0.5)) of the underlying raw
//     column; max_cpu reuses the cpu_percent median as an approximation.
//   - "std" is the inter-quartile range (p75 - p25), floored at 1e-9, for
//     avg_cpu, max_cpu, avg_iowait, avg_net_in, avg_net_out and
//     avg_disk_io_ticks; every other feature gets a fixed std of 1.0.
//   - std_* features and the two log features are stored as (0.0, 1.0), i.e.
//     they pass through the scaler unchanged.
//
// The m_std_* columns of the stats CTE are not referenced by the outer SELECT.
func BuildFitScalerQuery() string {
	const fitSQL = `
	INSERT OR REPLACE INTO scaler_params (feature_name, mean, std)
	WITH stats AS (
		SELECT
			-- CPU
			PERCENTILE_CONT(0.5) WITHIN GROUP (ORDER BY cpu_percent) AS m_avg_cpu,
			PERCENTILE_CONT(0.5) WITHIN GROUP (ORDER BY cpu_percent) AS m_max_cpu, -- Approximation
			0.0 AS m_std_cpu, -- Baseline std is often 0 or low
			-- System
			PERCENTILE_CONT(0.5) WITHIN GROUP (ORDER BY cpu_iowait_percent) AS m_avg_iowait,
			0.0 AS m_std_iowait,
			PERCENTILE_CONT(0.5) WITHIN GROUP (ORDER BY cpu_softirq_percent) AS m_avg_softirq,
			PERCENTILE_CONT(0.5) WITHIN GROUP (ORDER BY context_switches_s) AS m_avg_ctx_switches,
			PERCENTILE_CONT(0.5) WITHIN GROUP (ORDER BY interrupts_s) AS m_avg_interrupts,
			PERCENTILE_CONT(0.5) WITHIN GROUP (ORDER BY softnet_dropped_s) AS m_avg_softnet_dropped,
			PERCENTILE_CONT(0.5) WITHIN GROUP (ORDER BY softnet_time_squeeze_s) AS m_avg_softnet_squeeze,
			-- Network
			PERCENTILE_CONT(0.5) WITHIN GROUP (ORDER BY net_in_mbps) AS m_avg_net_in,
			0.0 AS m_std_net_in,
			PERCENTILE_CONT(0.5) WITHIN GROUP (ORDER BY net_out_mbps) AS m_avg_net_out,
			0.0 AS m_std_net_out,
			PERCENTILE_CONT(0.5) WITHIN GROUP (ORDER BY tcp_retrans_s) AS m_sum_tcp_retrans,
			PERCENTILE_CONT(0.5) WITHIN GROUP (ORDER BY tcp_fast_retrans_s) AS m_sum_tcp_fast_retrans,
			PERCENTILE_CONT(0.5) WITHIN GROUP (ORDER BY tcp_timeouts_s) AS m_sum_tcp_timeouts,
			PERCENTILE_CONT(0.5) WITHIN GROUP (ORDER BY network_drops_s) AS m_avg_net_drops,
			-- Disk
			PERCENTILE_CONT(0.5) WITHIN GROUP (ORDER BY disk_read_mbps) AS m_avg_disk_read,
			PERCENTILE_CONT(0.5) WITHIN GROUP (ORDER BY disk_write_mbps) AS m_avg_disk_write,
			PERCENTILE_CONT(0.5) WITHIN GROUP (ORDER BY disk_io_ticks_s) AS m_avg_disk_io_ticks,
			0.0 AS m_std_disk_io_ticks,

			-- IQRs for scaling
			(PERCENTILE_CONT(0.75) WITHIN GROUP (ORDER BY cpu_percent) - PERCENTILE_CONT(0.25) WITHIN GROUP (ORDER BY cpu_percent)) AS s_avg_cpu,
			(PERCENTILE_CONT(0.75) WITHIN GROUP (ORDER BY cpu_iowait_percent) - PERCENTILE_CONT(0.25) WITHIN GROUP (ORDER BY cpu_iowait_percent)) AS s_avg_iowait,
			(PERCENTILE_CONT(0.75) WITHIN GROUP (ORDER BY net_in_mbps) - PERCENTILE_CONT(0.25) WITHIN GROUP (ORDER BY net_in_mbps)) AS s_avg_net_in,
			(PERCENTILE_CONT(0.75) WITHIN GROUP (ORDER BY net_out_mbps) - PERCENTILE_CONT(0.25) WITHIN GROUP (ORDER BY net_out_mbps)) AS s_avg_net_out,
			(PERCENTILE_CONT(0.75) WITHIN GROUP (ORDER BY disk_io_ticks_s) - PERCENTILE_CONT(0.25) WITHIN GROUP (ORDER BY disk_io_ticks_s)) AS s_avg_disk_io_ticks
		FROM raw_metrics
		WHERE timestamp >= $1 AND timestamp < $2
	)
	SELECT feature_name, mean, std FROM (
		SELECT 'avg_cpu' AS feature_name, s.m_avg_cpu AS mean, GREATEST(s.s_avg_cpu, 1e-9) AS std FROM stats s UNION ALL
		SELECT 'max_cpu', s.m_max_cpu, GREATEST(s.s_avg_cpu, 1e-9) FROM stats s UNION ALL
		SELECT 'std_cpu', 0.0, 1.0 FROM stats s UNION ALL
		SELECT 'avg_iowait', s.m_avg_iowait, GREATEST(s.s_avg_iowait, 1e-9) FROM stats s UNION ALL
		SELECT 'std_iowait', 0.0, 1.0 FROM stats s UNION ALL
		SELECT 'avg_softirq', s.m_avg_softirq, 1.0 FROM stats s UNION ALL
		SELECT 'avg_ctx_switches', s.m_avg_ctx_switches, 1.0 FROM stats s UNION ALL
		SELECT 'avg_interrupts', s.m_avg_interrupts, 1.0 FROM stats s UNION ALL
		SELECT 'avg_softnet_dropped', s.m_avg_softnet_dropped, 1.0 FROM stats s UNION ALL
		SELECT 'avg_softnet_squeeze', s.m_avg_softnet_squeeze, 1.0 FROM stats s UNION ALL
		SELECT 'avg_net_in', s.m_avg_net_in, GREATEST(s.s_avg_net_in, 1e-9) FROM stats s UNION ALL
		SELECT 'std_net_in', 0.0, 1.0 FROM stats s UNION ALL
		SELECT 'avg_net_out', s.m_avg_net_out, GREATEST(s.s_avg_net_out, 1e-9) FROM stats s UNION ALL
		SELECT 'std_net_out', 0.0, 1.0 FROM stats s UNION ALL
		SELECT 'sum_tcp_retrans', s.m_sum_tcp_retrans, 1.0 FROM stats s UNION ALL
		SELECT 'sum_tcp_fast_retrans', s.m_sum_tcp_fast_retrans, 1.0 FROM stats s UNION ALL
		SELECT 'sum_tcp_timeouts', s.m_sum_tcp_timeouts, 1.0 FROM stats s UNION ALL
		SELECT 'avg_net_drops', s.m_avg_net_drops, 1.0 FROM stats s UNION ALL
		SELECT 'avg_disk_read', s.m_avg_disk_read, 1.0 FROM stats s UNION ALL
		SELECT 'avg_disk_write', s.m_avg_disk_write, 1.0 FROM stats s UNION ALL
		SELECT 'avg_disk_io_ticks', s.m_avg_disk_io_ticks, GREATEST(s.s_avg_disk_io_ticks, 1e-9) FROM stats s UNION ALL
		SELECT 'std_disk_io_ticks', 0.0, 1.0 FROM stats s UNION ALL
		SELECT 'error_count', 0.0, 1.0 UNION ALL
		SELECT 'severity_score', 0.0, 1.0
	) t`
	return fitSQL
}
|
||||||
|
|
||||||
|
func BuildFusionQuery(maskingPatterns []config.MaskingPattern, systemctlServices []string, windowInterval string) string {
|
||||||
|
numericCols := collectNumericCols(maskingPatterns)
|
||||||
|
paramCTE := ""
|
||||||
|
paramSelect := ""
|
||||||
|
paramJoin := ""
|
||||||
|
if len(numericCols) > 0 {
|
||||||
|
var aggs []string
|
||||||
|
for _, col := range numericCols {
|
||||||
|
aggs = append(aggs, fmt.Sprintf("AVG(%s) AS avg_%s", col, col))
|
||||||
|
paramSelect += fmt.Sprintf(", COALESCE(p.avg_%s, 0.0) AS avg_%s", col, col)
|
||||||
|
}
|
||||||
|
paramCTE = fmt.Sprintf(`, param_agg AS (SELECT time_bucket(INTERVAL '%s', event_time) AS ws, %s FROM log_params GROUP BY 1)`, windowInterval, strings.Join(aggs, ", "))
|
||||||
|
paramJoin = "LEFT JOIN param_agg p ON m.ws = p.ws"
|
||||||
|
}
|
||||||
|
|
||||||
|
svcCTE := ""
|
||||||
|
svcSelect := ""
|
||||||
|
svcJoin := ""
|
||||||
|
if len(systemctlServices) > 0 {
|
||||||
|
var svcAggs []string
|
||||||
|
for _, svc := range systemctlServices {
|
||||||
|
safeName := strings.ReplaceAll(strings.ReplaceAll(svc, ".", "_"), "-", "_")
|
||||||
|
svcAggs = append(svcAggs, fmt.Sprintf(`MODE(CASE WHEN active_state = 'active' THEN 1 WHEN active_state = 'failed' THEN -1 ELSE 0 END) AS state_%s`, safeName))
|
||||||
|
svcSelect += fmt.Sprintf(", COALESCE(s.state_%s, 0) AS svc_%s", safeName, safeName)
|
||||||
|
}
|
||||||
|
svcCTE = fmt.Sprintf(`, svc_agg AS (SELECT time_bucket(INTERVAL '%s', timestamp) AS ws, %s FROM service_status GROUP BY 1)`, windowInterval, strings.Join(svcAggs, ", "))
|
||||||
|
svcJoin = "LEFT JOIN svc_agg s ON m.ws = s.ws"
|
||||||
|
}
|
||||||
|
|
||||||
|
var scFields []string
|
||||||
|
for _, name := range scalerFeatureNames {
|
||||||
|
scFields = append(scFields, fmt.Sprintf("COALESCE(MAX(CASE WHEN feature_name='%s' THEN mean END),0) AS m_%s, COALESCE(MAX(CASE WHEN feature_name='%s' THEN std END),1) AS s_%s", name, name, name, name))
|
||||||
|
}
|
||||||
|
|
||||||
|
var normVecFields []string
|
||||||
|
for _, name := range scalerFeatureNames {
|
||||||
|
// DuckDB aggregation aliases match these exactly (see metric_agg and log_agg below)
|
||||||
|
src := name
|
||||||
|
if name == "severity_score" || name == "error_count" {
|
||||||
|
src = "l." + name
|
||||||
|
} else {
|
||||||
|
src = "m." + name
|
||||||
|
}
|
||||||
|
normVecFields = append(normVecFields, fmt.Sprintf("(COALESCE(%s, 0.0) - sc.m_%s) / sc.s_%s AS sc_%s", src, name, name, name))
|
||||||
|
}
|
||||||
|
|
||||||
|
return fmt.Sprintf(`
|
||||||
|
WITH metric_agg AS (
|
||||||
|
SELECT
|
||||||
|
time_bucket(INTERVAL '%[1]s', timestamp) AS ws,
|
||||||
|
AVG(cpu_percent) AS avg_cpu, MAX(cpu_percent) AS max_cpu, STDDEV_SAMP(cpu_percent) AS std_cpu,
|
||||||
|
AVG(cpu_iowait_percent) AS avg_iowait, STDDEV_SAMP(cpu_iowait_percent) AS std_iowait,
|
||||||
|
AVG(cpu_softirq_percent) AS avg_softirq, AVG(context_switches_s) AS avg_ctx_switches,
|
||||||
|
AVG(interrupts_s) AS avg_interrupts, AVG(softnet_dropped_s) AS avg_softnet_dropped,
|
||||||
|
AVG(softnet_time_squeeze_s) AS avg_softnet_squeeze,
|
||||||
|
AVG(memory_used_mb) AS avg_mem_used, AVG(memory_cached_mb) AS avg_mem_cached, MAX(memory_dirty_mb) AS max_mem_dirty,
|
||||||
|
AVG(net_in_mbps) AS avg_net_in, STDDEV_SAMP(net_in_mbps) AS std_net_in,
|
||||||
|
AVG(net_out_mbps) AS avg_net_out, STDDEV_SAMP(net_out_mbps) AS std_net_out,
|
||||||
|
SUM(tcp_retrans_s) AS sum_tcp_retrans, SUM(tcp_fast_retrans_s) AS sum_tcp_fast_retrans,
|
||||||
|
SUM(tcp_timeouts_s) AS sum_tcp_timeouts, AVG(network_drops_s) AS avg_net_drops,
|
||||||
|
AVG(disk_read_mbps) AS avg_disk_read, AVG(disk_write_mbps) AS avg_disk_write,
|
||||||
|
AVG(disk_io_ticks_s) AS avg_disk_io_ticks, STDDEV_SAMP(disk_io_ticks_s) AS std_disk_io_ticks,
|
||||||
|
SUM(disk_read_time_s) AS sum_disk_read_time, SUM(disk_write_time_s) AS sum_disk_write_time,
|
||||||
|
SUM(disk_reads_s) AS sum_disk_reads, SUM(disk_writes_s) AS sum_disk_writes,
|
||||||
|
SUM(net_packets_in_s) AS sum_packets_in, SUM(net_packets_out_s) AS sum_packets_out
|
||||||
|
FROM raw_metrics GROUP BY 1
|
||||||
|
),
|
||||||
|
log_agg AS (
|
||||||
|
SELECT
|
||||||
|
time_bucket(INTERVAL '%[1]s', timestamp) AS ws,
|
||||||
|
COUNT(*) AS log_event_count, COUNT(DISTINCT template_id) AS unique_templates,
|
||||||
|
SUM(CASE WHEN severity = 'ERROR' THEN 1 ELSE 0 END) AS error_count,
|
||||||
|
SUM(CASE
|
||||||
|
WHEN severity = 'ERROR' THEN 10
|
||||||
|
WHEN severity = 'WARN' THEN 3
|
||||||
|
ELSE 1
|
||||||
|
END) AS severity_score
|
||||||
|
FROM log_events GROUP BY 1
|
||||||
|
)%[2]s%[3]s,
|
||||||
|
scaler AS (
|
||||||
|
SELECT %[4]s FROM scaler_params
|
||||||
|
)
|
||||||
|
SELECT m.ws,
|
||||||
|
m.*, l.log_event_count, l.unique_templates, l.error_count, l.severity_score%[5]s%[6]s,
|
||||||
|
%[7]s
|
||||||
|
FROM metric_agg m
|
||||||
|
LEFT JOIN log_agg l ON m.ws = l.ws
|
||||||
|
%[8]s %[9]s
|
||||||
|
CROSS JOIN scaler sc
|
||||||
|
ORDER BY m.ws DESC LIMIT 1`,
|
||||||
|
windowInterval, paramCTE, svcCTE, strings.Join(scFields, ", "), paramSelect, svcSelect, strings.Join(normVecFields, ", "), paramJoin, svcJoin)
|
||||||
|
}
|
||||||
|
|
||||||
|
func BuildLogParamsSchema(patterns []config.MaskingPattern) string {
|
||||||
|
cols := []string{"event_time TIMESTAMP WITH TIME ZONE"}
|
||||||
|
for _, mp := range patterns {
|
||||||
|
if mp.Name == "" {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
cols = append(cols, fmt.Sprintf("param_%s %s", mp.Name, sqlType(mp.Type)))
|
||||||
|
}
|
||||||
|
return fmt.Sprintf("CREATE TABLE IF NOT EXISTS log_params (\n\t%s\n)", strings.Join(cols, ",\n\t"))
|
||||||
|
}
|
||||||
|
|
||||||
|
// sqlType maps a masking-pattern type name to the SQL column type used in
// log_params: "float" becomes DOUBLE, "int" becomes BIGINT and every other
// value (including the empty string) falls back to VARCHAR.
func sqlType(t string) string {
	if t == "float" {
		return "DOUBLE"
	}
	if t == "int" {
		return "BIGINT"
	}
	return "VARCHAR"
}
|
||||||
|
|
||||||
|
func collectNumericCols(patterns []config.MaskingPattern) []string {
|
||||||
|
var cols []string
|
||||||
|
for _, mp := range patterns {
|
||||||
|
if mp.Name == "" || mp.Type == "string" {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
cols = append(cols, "param_"+mp.Name)
|
||||||
|
}
|
||||||
|
return cols
|
||||||
|
}
|
||||||
302
pkg/types/types.go
Normal file
302
pkg/types/types.go
Normal file
|
|
@ -0,0 +1,302 @@
|
||||||
|
// Package types defines the shared data structures that flow between pipeline
|
||||||
|
// stages. All types are value-safe to copy and JSON-serialisable.
|
||||||
|
package types
|
||||||
|
|
||||||
|
import "time"
|
||||||
|
|
||||||
|
// ── LogEvent ─────────────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
// LogEvent represents a single parsed log line after Drain3 template mining.
type LogEvent struct {
	// Timestamp of the log line.
	Timestamp time.Time `json:"timestamp"`
	// TemplateID identifies the mined template the line was matched to.
	TemplateID int `json:"template_id"`
	// Params holds the extracted template parameters as name/value strings.
	Params map[string]string `json:"params"`
	// Severity is the textual log level of the line.
	Severity string `json:"severity"`
	// RawLine is the unparsed log line.
	RawLine string `json:"raw_line"`
}
|
||||||
|
|
||||||
|
// ── ServiceStatus ─────────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
// ServiceStatus represents the state of a systemd service.
type ServiceStatus struct {
	// Timestamp of the observation.
	Timestamp time.Time `json:"timestamp"`
	// ServiceName is the name of the observed service.
	ServiceName string `json:"service_name"`
	// ActiveState, e.g. "active", "inactive", "failed".
	ActiveState string `json:"active_state"`
	// SubState, e.g. "running", "dead", "exited".
	SubState string `json:"sub_state"`
}
|
||||||
|
|
||||||
|
// ── MetricSnapshot ────────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
// MetricSnapshot is a 1 Hz sample of Linux system metrics collected from /proc.
//
// The JSON tag of every field equals the raw_metrics column name used by the
// transform queries.
type MetricSnapshot struct {
	Timestamp time.Time `json:"timestamp"`

	// CPU time shares.
	CPUPercent        float64 `json:"cpu_percent"`
	CPUIoWaitPercent  float64 `json:"cpu_iowait_percent"`
	CPUSoftIrqPercent float64 `json:"cpu_softirq_percent"`

	// Scheduler / interrupt rates.
	ContextSwitchesPerS float64 `json:"context_switches_s"`
	InterruptsPerS      float64 `json:"interrupts_s"`

	// Memory.
	MemoryUsedMB   float64 `json:"memory_used_mb"`
	MemoryCachedMB float64 `json:"memory_cached_mb"`
	MemoryDirtyMB  float64 `json:"memory_dirty_mb"`

	// Network throughput.
	NetworkInMBps  float64 `json:"net_in_mbps"`
	NetworkOutMBps float64 `json:"net_out_mbps"`

	// Network error and drop rates.
	NetErrorsPerS float64 `json:"network_errors_s"`
	NetDropsPerS  float64 `json:"network_drops_s"`

	// TCP retransmission counters.
	TCPRetransPerS        float64 `json:"tcp_retrans_s"`
	TCPTimeoutsPerS       float64 `json:"tcp_timeouts_s"`
	TCPLostRetransmitPerS float64 `json:"tcp_lost_retransmit_s"`
	TCPFastRetransPerS    float64 `json:"tcp_fast_retrans_s"`

	// Disk throughput and timing. Note that the two *TimeMsPerS fields are
	// serialised as disk_read_time_s / disk_write_time_s.
	DiskReadMBps        float64 `json:"disk_read_mbps"`
	DiskWriteMBps       float64 `json:"disk_write_mbps"`
	DiskReadTimeMsPerS  float64 `json:"disk_read_time_s"`
	DiskWriteTimeMsPerS float64 `json:"disk_write_time_s"`
	DiskIOTicksPerS     float64 `json:"disk_io_ticks_s"`

	// Softnet counters.
	SoftnetDroppedPerS     float64 `json:"softnet_dropped_s"`
	SoftnetTimeSqueezePerS float64 `json:"softnet_time_squeeze_s"`

	// Packet rates.
	NetPacketsInPerS  float64 `json:"net_packets_in_s"`
	NetPacketsOutPerS float64 `json:"net_packets_out_s"`

	// Completed disk operations.
	DiskReadsCompletedPerS  float64 `json:"disk_reads_s"`
	DiskWritesCompletedPerS float64 `json:"disk_writes_s"`
}
|
||||||
|
|
||||||
|
// ── FeatureVector ─────────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
// FeatureVector is the output of the DuckDB Tumbling-Window fusion layer.
//
// # NormalizedVector layout
//
// NOTE(review): the slot table that used to be documented here (slots 0–35,
// including memory slots) no longer matches the code. The scaler feature list
// in the transform package has 24 entries and contains no memory features,
// and the transform tests read TcpRollStd, NetRollStd, MemPressure and
// NetAsymmetry at indices 41–44, which is exactly the ordering produced by
// ToFloatSlice. The layout below is inferred from those two facts; confirm it
// against the code that fills NormalizedVector.
//
//	Slot  0–23: base aggregates in scaler-feature order (DuckDB RobustScaled)
//	Slot 24–44: engineered features in ToFloatSlice order (unscaled)
//	Slot 45+:   presumably Drain param averages (unscaled)
//
// # Engineered feature definitions
//
// Carried over from the previous layout notes; verify against the code that
// computes them before relying on the exact formulas:
//
//	CPUDelta        – Δavg_cpu vs previous window, %-points
//	RatioTCPNet     – sum_tcp_retrans / (avg_net_out + 1e-3) (no field of this name exists)
//	DeltaCtx        – Δavg_ctx_switches vs previous window
//	NetDelta        – Δavg_net_out vs previous window, MBps
//	CPURollStd      – rolling σ(avg_cpu, 12 windows)
//	CPUEfficiency   – avg_cpu / (avg_net_out + 1)
//	IOWaitProxy     – avg_disk_io_ticks / (avg_cpu + 1)
//	LogDensity      – unique_templates / (log_count + 1)
//	DeltaNetIn      – Δavg_net_in vs previous window, MBps
//	DeltaTCPRetrans – Δsum_tcp_retrans vs previous window
//	TcpRollStd      – rolling σ(sum_tcp_retrans, 5 windows)
//	NetRollStd      – rolling σ(avg_net_out, 5 windows)
//	MemPressure     – avg_dirty_mb / (avg_mem_used + 1)
//	NetAsymmetry    – avg_net_in / (avg_net_out + 1e-3)
type FeatureVector struct {
	Timestamp   time.Time `json:"timestamp"`
	WindowStart time.Time `json:"window_start"`
	WindowEnd   time.Time `json:"window_end"`

	// CPU aggregations
	AvgCPUPercent  float64 `json:"avg_cpu"`
	MaxCPUPercent  float64 `json:"max_cpu"`
	StdCPUPercent  float64 `json:"std_cpu"`
	AvgCPUIoWait   float64 `json:"avg_iowait"`
	StdCPUIoWait   float64 `json:"std_iowait"`
	AvgCPUSoftIrq  float64 `json:"avg_softirq"`
	AvgCtxSwitches float64 `json:"avg_ctx_switches"`
	AvgInterrupts  float64 `json:"avg_interrupts"`

	// Memory aggregations
	AvgMemUsedMB   float64 `json:"avg_mem_used"`
	AvgMemCachedMB float64 `json:"avg_mem_cached"`
	MaxMemDirtyMB  float64 `json:"max_mem_dirty"`

	// Disk aggregations
	AvgDiskIOTicks   float64 `json:"avg_disk_io_ticks"`
	StdDiskIOTicks   float64 `json:"std_disk_io_ticks"`
	AvgDiskReadMBps  float64 `json:"avg_disk_read"`
	AvgDiskWriteMBps float64 `json:"avg_disk_write"`

	// Network aggregations
	AvgNetInMBps      float64 `json:"avg_net_in"`
	StdNetInMBps      float64 `json:"std_net_in"`
	AvgNetOutMBps     float64 `json:"avg_net_out"`
	StdNetOutMBps     float64 `json:"std_net_out"`
	AvgNetDrops       float64 `json:"avg_net_drops"`
	AvgSoftnetDropped float64 `json:"avg_softnet_dropped"`
	AvgSoftnetSqueeze float64 `json:"avg_softnet_squeeze"`

	// TCP aggregations
	SumTCPRetrans     float64 `json:"sum_tcp_retrans"`
	SumTCPFastRetrans float64 `json:"sum_tcp_fast_retrans"`
	SumTCPTimeouts    float64 `json:"sum_tcp_timeouts"`

	// Log aggregations
	ErrorCount    int     `json:"error_count"`
	SeverityScore float64 `json:"severity_score"`

	// Engineered / Derived features
	CPUDelta         float64 `json:"cpu_delta"`
	CPURollStd       float64 `json:"cpu_roll_std"`
	CPUEfficiency    float64 `json:"cpu_efficiency"`
	DeltaCtx         float64 `json:"delta_ctx"`
	NetDelta         float64 `json:"net_delta"`
	AvgNetThroughput float64 `json:"avg_net_throughput"`
	CPUPerMB         float64 `json:"cpu_per_mb"`
	NetworkDiskRatio float64 `json:"network_disk_ratio"`
	RetransPerPacket float64 `json:"retrans_per_packet"`
	RetransPerMB     float64 `json:"retrans_per_mb"`
	AvgDiskLatencyMS float64 `json:"avg_disk_latency_ms"`
	LogCountTotal    int     `json:"log_count_total"`
	UniqueTemplates  int     `json:"unique_templates"`
	LogDensity       float64 `json:"log_density"`
	IOWaitProxy      float64 `json:"io_wait_proxy"`
	DeltaNetIn       float64 `json:"delta_net_in"`
	DeltaTCPRetrans  float64 `json:"delta_tcp_retrans"`
	TcpRollStd       float64 `json:"tcp_roll_std"`
	NetRollStd       float64 `json:"net_roll_std"`
	MemPressure      float64 `json:"mem_pressure"`
	NetAsymmetry     float64 `json:"net_asymmetry"`

	// Drain parameter aggregations
	ParamAvg map[string]float64 `json:"param_avg"`

	// ServiceStatuses maps service names to their encoded state (active=1, inactive=0, failed=-1).
	ServiceStatuses map[string]float64 `json:"service_statuses"`

	// NormalizedVector is the flat float64 slice consumed by anomaly detectors.
	NormalizedVector []float64 `json:"normalized_vector"`
}
|
||||||
|
|
||||||
|
// ToFloatSlice serialises fv to a deterministic []float64 for offline EDA.
|
||||||
|
// Returns raw (unscaled) values; use NormalizedVector for ML inference.
|
||||||
|
//
|
||||||
|
// [avg_cpu, max_cpu, std_cpu,
|
||||||
|
// avg_iowait, std_iowait, avg_softirq, avg_ctx_switches, avg_interrupts,
|
||||||
|
// avg_softnet_dropped, avg_softnet_squeeze,
|
||||||
|
// avg_net_in, std_net_in, avg_net_out, std_net_out,
|
||||||
|
// sum_tcp_retrans, sum_tcp_fast_retrans, sum_tcp_timeouts, avg_net_drops,
|
||||||
|
// avg_disk_read, avg_disk_write, avg_disk_io_ticks, std_disk_io_ticks,
|
||||||
|
// error_count, severity_score,
|
||||||
|
// cpu_delta, cpu_roll_std, cpu_efficiency, delta_ctx, net_delta,
|
||||||
|
// avg_net_throughput, cpu_per_mb, network_disk_ratio, retrans_per_packet,
|
||||||
|
// retrans_per_mb, avg_disk_latency_ms, log_count_total, unique_templates,
|
||||||
|
// log_density, io_wait_proxy, delta_net_in, delta_tcp_retrans,
|
||||||
|
// tcp_roll_std, net_roll_std, mem_pressure, net_asymmetry,
|
||||||
|
// param_*]
|
||||||
|
func (fv FeatureVector) ToFloatSlice(paramNames []string) []float64 {
|
||||||
|
out := make([]float64, 0, 45+len(paramNames))
|
||||||
|
out = append(out,
|
||||||
|
// Base Aggregates (24)
|
||||||
|
fv.AvgCPUPercent, fv.MaxCPUPercent, fv.StdCPUPercent,
|
||||||
|
fv.AvgCPUIoWait, fv.StdCPUIoWait, fv.AvgCPUSoftIrq, fv.AvgCtxSwitches, fv.AvgInterrupts,
|
||||||
|
fv.AvgSoftnetDropped, fv.AvgSoftnetSqueeze,
|
||||||
|
fv.AvgNetInMBps, fv.StdNetInMBps, fv.AvgNetOutMBps, fv.StdNetOutMBps,
|
||||||
|
fv.SumTCPRetrans, fv.SumTCPFastRetrans, fv.SumTCPTimeouts, fv.AvgNetDrops,
|
||||||
|
fv.AvgDiskReadMBps, fv.AvgDiskWriteMBps, fv.AvgDiskIOTicks, fv.StdDiskIOTicks,
|
||||||
|
float64(fv.ErrorCount), fv.SeverityScore,
|
||||||
|
|
||||||
|
// Engineered Features (21)
|
||||||
|
fv.CPUDelta, fv.CPURollStd, fv.CPUEfficiency, fv.DeltaCtx, fv.NetDelta,
|
||||||
|
fv.AvgNetThroughput, fv.CPUPerMB, fv.NetworkDiskRatio, fv.RetransPerPacket,
|
||||||
|
fv.RetransPerMB, fv.AvgDiskLatencyMS, float64(fv.LogCountTotal),
|
||||||
|
float64(fv.UniqueTemplates), fv.LogDensity, fv.IOWaitProxy,
|
||||||
|
fv.DeltaNetIn, fv.DeltaTCPRetrans, fv.TcpRollStd, fv.NetRollStd,
|
||||||
|
fv.MemPressure, fv.NetAsymmetry,
|
||||||
|
)
|
||||||
|
for _, name := range paramNames {
|
||||||
|
out = append(out, fv.ParamAvg[name])
|
||||||
|
}
|
||||||
|
return out
|
||||||
|
}
|
||||||
|
|
||||||
|
// ToNamedMap returns the feature vector as map[string]float64
|
||||||
|
func (fv FeatureVector) ToNamedMap(paramNames []string) map[string]float64 {
|
||||||
|
m := map[string]float64{
|
||||||
|
"avg_cpu": fv.AvgCPUPercent,
|
||||||
|
"max_cpu": fv.MaxCPUPercent,
|
||||||
|
"std_cpu": fv.StdCPUPercent,
|
||||||
|
"avg_iowait": fv.AvgCPUIoWait,
|
||||||
|
"std_iowait": fv.StdCPUIoWait,
|
||||||
|
"avg_softirq": fv.AvgCPUSoftIrq,
|
||||||
|
"avg_ctx_switches": fv.AvgCtxSwitches,
|
||||||
|
"avg_interrupts": fv.AvgInterrupts,
|
||||||
|
"avg_softnet_dropped": fv.AvgSoftnetDropped,
|
||||||
|
"avg_softnet_squeeze": fv.AvgSoftnetSqueeze,
|
||||||
|
"avg_net_in": fv.AvgNetInMBps,
|
||||||
|
"std_net_in": fv.StdNetInMBps,
|
||||||
|
"avg_net_out": fv.AvgNetOutMBps,
|
||||||
|
"std_net_out": fv.StdNetOutMBps,
|
||||||
|
"avg_net_drops": fv.AvgNetDrops,
|
||||||
|
"sum_tcp_retrans": fv.SumTCPRetrans,
|
||||||
|
"sum_tcp_fast_retrans": fv.SumTCPFastRetrans,
|
||||||
|
"sum_tcp_timeouts": fv.SumTCPTimeouts,
|
||||||
|
"avg_disk_read": fv.AvgDiskReadMBps,
|
||||||
|
"avg_disk_write": fv.AvgDiskWriteMBps,
|
||||||
|
"avg_disk_io_ticks": fv.AvgDiskIOTicks,
|
||||||
|
"std_disk_io_ticks": fv.StdDiskIOTicks,
|
||||||
|
"error_count": float64(fv.ErrorCount),
|
||||||
|
"severity_score": fv.SeverityScore,
|
||||||
|
"cpu_delta": fv.CPUDelta,
|
||||||
|
"cpu_roll_std": fv.CPURollStd,
|
||||||
|
"cpu_efficiency": fv.CPUEfficiency,
|
||||||
|
"delta_ctx": fv.DeltaCtx,
|
||||||
|
"net_delta": fv.NetDelta,
|
||||||
|
"avg_net_throughput": fv.AvgNetThroughput,
|
||||||
|
"cpu_per_mb": fv.CPUPerMB,
|
||||||
|
"network_disk_ratio": fv.NetworkDiskRatio,
|
||||||
|
"retrans_per_packet": fv.RetransPerPacket,
|
||||||
|
"retrans_per_mb": fv.RetransPerMB,
|
||||||
|
"avg_disk_latency_ms": fv.AvgDiskLatencyMS,
|
||||||
|
"log_count_total": float64(fv.LogCountTotal),
|
||||||
|
"unique_templates": float64(fv.UniqueTemplates),
|
||||||
|
"log_density": fv.LogDensity,
|
||||||
|
"io_wait_proxy": fv.IOWaitProxy,
|
||||||
|
"delta_net_in": fv.DeltaNetIn,
|
||||||
|
"delta_tcp_retrans": fv.DeltaTCPRetrans,
|
||||||
|
"tcp_roll_std": fv.TcpRollStd,
|
||||||
|
"net_roll_std": fv.NetRollStd,
|
||||||
|
"mem_pressure": fv.MemPressure,
|
||||||
|
"net_asymmetry": fv.NetAsymmetry,
|
||||||
|
}
|
||||||
|
for _, name := range paramNames {
|
||||||
|
m["avg_param_"+name] = fv.ParamAvg[name]
|
||||||
|
}
|
||||||
|
return m
|
||||||
|
}
|
||||||
|
|
||||||
|
// ── AnomalyResult ─────────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
// AnomalyResult is the final output of the detection layer.
type AnomalyResult struct {
	// Timestamp the result refers to.
	Timestamp time.Time `json:"timestamp"`
	// Score is the numeric anomaly score.
	Score float64 `json:"score"`
	// IsAnomaly is the binary verdict.
	IsAnomaly bool `json:"is_anomaly"`
	// Confidence attached to the verdict.
	Confidence float64 `json:"confidence"`
	// Method names the detector that produced the result.
	Method string `json:"method"`
	// Details is optional free text; it is omitted from JSON when empty.
	Details string `json:"details,omitempty"`
}
|
||||||
|
|
||||||
|
// ── StageHealth ───────────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
// StageHealth stores per-stage monitoring counters.
type StageHealth struct {
	// StageName identifies the pipeline stage.
	StageName string `json:"stage_name"`
	// EventsProcessed counts events the stage has handled.
	EventsProcessed uint64 `json:"events_processed"`
	// EventsDropped counts events the stage has discarded.
	EventsDropped uint64 `json:"events_dropped"`
	// AvgLatencyMs is the average latency in milliseconds.
	AvgLatencyMs float64 `json:"avg_latency_ms"`
	// Throughput is serialised as throughput_eps.
	Throughput float64 `json:"throughput_eps"`
	// LastUpdate is the time the counters were last refreshed.
	LastUpdate time.Time `json:"last_update"`
}
|
||||||
Loading…
Add table
Add a link
Reference in a new issue