commit for version used in evaluation of thesis

This commit is contained in:
Patryk Hegenberg 2026-03-29 10:03:18 +02:00
commit 72635dc7b9
27 changed files with 6084 additions and 0 deletions

203
internal/config/config.go Normal file
View file

@ -0,0 +1,203 @@
// Package config provides the pipeline configuration loaded from YAML.
package config
import (
"fmt"
"os"
"regexp"
"time"
"gopkg.in/yaml.v3"
)
// MaskingPattern is a single entry in drain.masking_patterns.
//
// Name, Pattern, Replace and Type are decoded from YAML. Re is runtime-only
// state: it is nil after decoding and is filled in by Config.Compile, which
// compiles Pattern.
type MaskingPattern struct {
	// Name identifies the pattern; Config.Compile quotes it in error messages
	// and Config.NumericPatternNames returns it for numeric patterns.
	Name string `yaml:"name"`
	// Pattern is the regular expression source handed to regexp.Compile.
	Pattern string `yaml:"pattern"`
	// Replace is decoded from the YAML key "replace".
	// NOTE(review): presumably the substitution text for a match; confirm
	// against the masking code that consumes it.
	Replace string `yaml:"replace"`
	// Type classifies the pattern; the values "float" and "int" mark it as
	// numeric for Config.NumericPatternNames.
	Type string `yaml:"type"`

	// Re is the compiled form of Pattern. It is excluded from YAML
	// (yaml:"-") so that a stray "re" key in the configuration cannot be
	// decoded into a zero-value, unusable *regexp.Regexp, and so that the
	// compiled state is never serialised.
	Re *regexp.Regexp `yaml:"-"`
}
// MADConfig defines parameters for the MAD detector.
//
// Both fields are decoded from the YAML keys named in their struct tags.
// NOTE(review): the defaults quoted below are not applied by the code visible
// in this file; presumably the detector applies them. Confirm.
type MADConfig struct {
// Threshold is the modified Z-score cutoff for IsAnomaly.
// Recommended: 3.0-4.0. Default: 3.5.
Threshold float64 `yaml:"threshold"`
// CalibrationSize is the number of NormalizedVectors to buffer before
// automatic per-feature median/MAD calibration runs.
// Default (if 0): 100.
CalibrationSize int `yaml:"calibration_size"`
}
// COPODConfig defines the parameters for the Copula-Based Outlier detector.
//
// Both fields are decoded from the YAML keys named in their struct tags.
type COPODConfig struct {
// Threshold is decoded from the YAML key "threshold".
// NOTE(review): presumably the anomaly-score cutoff; its range is not
// visible in this file. Confirm against the COPOD detector.
Threshold float64 `yaml:"threshold"`
// BufferSize is decoded from the YAML key "buffer_size".
// NOTE(review): presumably the number of samples retained by the detector;
// confirm against the COPOD detector.
BufferSize int `yaml:"buffer_size"`
}
// RRCFConfig defines the parameters for the Robust Random Cut Forest detector.
// Used for the standalone RRCF detector and the classic AVG/MAX/MEDIAN ensemble paths.
//
// It carries the same three fields and YAML keys as RRCFVariantConfig.
type RRCFConfig struct {
// NumTrees is decoded from the YAML key "num_trees".
NumTrees int `yaml:"num_trees"`
// TreeSize is decoded from the YAML key "tree_size".
TreeSize int `yaml:"tree_size"`
// ThresholdPercentile is decoded from the YAML key "threshold_percentile".
// NOTE(review): whether this is expressed on a 0-1 or 0-100 scale is not
// visible in this file. Confirm against the RRCF detector.
ThresholdPercentile float64 `yaml:"threshold_percentile"`
}
// RRCFVariantConfig holds parameters for a single named RRCF variant inside
// the SEAD multi-horizon ensemble.
//
// It is used as the element type of RRCFVariantsConfig (Fast, Mid, Slow) and
// carries the same fields and YAML keys as RRCFConfig.
type RRCFVariantConfig struct {
// NumTrees controls score stability: more trees → smoother/conservative.
NumTrees int `yaml:"num_trees"`
// TreeSize sets the sliding-window capacity per tree.
TreeSize int `yaml:"tree_size"`
// ThresholdPercentile is the per-model decision threshold (standalone use).
ThresholdPercentile float64 `yaml:"threshold_percentile"`
}
// RRCFVariantsConfig groups the three RRCF variants used by the SEAD ensemble.
// Each variant captures anomalies at a different time-horizon:
// - Fast: short memory, reactive to transient spikes
// - Mid: medium memory, balanced sensitivity
// - Slow: long memory, detects sustained / slow-drift events
//
// Each variant is decoded from its own YAML sub-mapping ("fast", "mid",
// "slow") into an RRCFVariantConfig.
type RRCFVariantsConfig struct {
// Fast is decoded from the YAML key "fast".
Fast RRCFVariantConfig `yaml:"fast"`
// Mid is decoded from the YAML key "mid".
Mid RRCFVariantConfig `yaml:"mid"`
// Slow is decoded from the YAML key "slow".
Slow RRCFVariantConfig `yaml:"slow"`
}
// SEADConfig holds tunable parameters for the SEAD ensemble.
// Only used when EnsembleConfig.Method == "sead".
//
// NOTE(review): the "Default (if 0)" values quoted below are not applied by
// the code visible in this file; presumably the ensemble applies them. Confirm.
type SEADConfig struct {
// Eta is the MWU learning rate η ∈ (0, 1].
// Higher values react faster to distribution shifts but are noisier.
// Recommended: 0.05-0.20. Default (if 0): 0.10.
Eta float64 `yaml:"eta"`
// Lambda is the KL-divergence regularisation strength.
// 0 = pure MWU (uniform prior). Recommended: 0.0-0.05. Default: 0.01.
Lambda float64 `yaml:"lambda"`
// QuantileWindow is the number of past scores retained per detector for
// streaming quantile normalisation. Default (if 0): 300.
QuantileWindow int `yaml:"quantile_window"`
// MinDataPoints is the cold-start guard: no anomaly is flagged until at
// least this many windows have been scored. Default (if 0): 20.
MinDataPoints int `yaml:"min_data_points"`
}
// EnsembleConfig manages the routing for the multi-model detector.
//
// All fields are decoded from the YAML keys named in their struct tags; this
// file performs no validation of Method or Contamination.
type EnsembleConfig struct {
// Enabled is decoded from the YAML key "enabled".
// NOTE(review): presumably switches the ensemble path on or off; confirm
// against the detector wiring.
Enabled bool `yaml:"enabled"`
// Method selects the score-aggregation strategy.
// Allowed values: "avg" (default), "max", "median", "sead".
// "sead": adaptive Multiplicative Weights Update ensemble (Shah et al., ICML 2025).
Method string `yaml:"method"`
// Contamination is the expected fraction of anomalous windows ∈ [0, 0.5).
// Determines the decision threshold as quantile(1-contamination) of
// the rolling combined score history.
Contamination float64 `yaml:"contamination"`
// SEAD tuning parameters (only applied when Method == "sead").
SEAD SEADConfig `yaml:"sead"`
}
// AutoScalingConfig holds thresholds and durations for dynamic detector switching.
//
// The *Duration fields are plain float64 values, given in seconds according
// to the inline examples; they are not time.Duration values.
// NOTE(review): what quantity the thresholds are compared against is not
// visible in this file. Confirm against the auto-scaling code.
type AutoScalingConfig struct {
// Enabled is decoded from the YAML key "enabled".
Enabled bool `yaml:"enabled"`
HighThreshold float64 `yaml:"high_threshold"` // e.g. 0.75 (Normal -> High)
CritThreshold float64 `yaml:"critical_threshold"` // e.g. 0.90 (High -> Critical)
HighDuration float64 `yaml:"high_duration"` // e.g. 30.0 (seconds)
CritDuration float64 `yaml:"critical_duration"` // e.g. 15.0 (seconds)
DownThreshold float64 `yaml:"down_threshold"` // e.g. 0.50 (back to Normal)
DownDuration float64 `yaml:"down_duration"` // e.g. 60.0 (seconds)
}
// DetectorConfig groups all anomaly detection configurations.
//
// It is embedded in Config as the Detection field and decoded from the
// top-level YAML key "detector".
type DetectorConfig struct {
// Method is decoded from the YAML key "method".
// NOTE(review): presumably selects which detector runs; the accepted
// values are not visible in this file. Confirm.
Method string `yaml:"method"`
// Ensemble holds the multi-model ensemble settings (YAML key "ensemble").
Ensemble EnsembleConfig `yaml:"ensemble"`
// MAD holds the MAD detector settings (YAML key "mad").
MAD MADConfig `yaml:"mad"`
// COPOD holds the COPOD detector settings (YAML key "copod").
COPOD COPODConfig `yaml:"copod"`
// RRCF is used by the standalone detector and the AVG/MAX/MEDIAN ensemble paths.
RRCF RRCFConfig `yaml:"rrcf"`
// RRCFVariants configures the three-horizon RRCF instances for the SEAD ensemble.
// Defaults are applied automatically when fields are zero.
// NOTE(review): that defaulting is not performed by the code visible in
// this file; confirm where it happens.
RRCFVariants RRCFVariantsConfig `yaml:"rrcf_variants"`
// AutoScaling holds the dynamic detector-switching settings
// (YAML key "auto_scaling").
AutoScaling AutoScalingConfig `yaml:"auto_scaling"`
}
// Config is the top-level pipeline configuration.
//
// It is produced by LoadConfig. The compiled regular expressions inside
// Drain.MaskingPatterns are only available after Compile has been called.
type Config struct {
// Ingestion is decoded from the YAML key "ingestion".
// NOTE(review): field meanings below are inferred from their names and
// YAML keys only; confirm against the ingestion code.
Ingestion struct {
LogPath string `yaml:"log_path"`
NetInterface string `yaml:"net_interface"`
DiskDevice string `yaml:"disk_device"`
SystemctlServices []string `yaml:"systemctl_services"`
} `yaml:"ingestion"`
// Transformation is decoded from the YAML key "transformation".
Transformation struct {
// WindowSize is a time.Duration.
// NOTE(review): yaml.v3 is expected to parse this from a duration
// string such as "30s"; confirm against the sample configuration.
WindowSize time.Duration `yaml:"window_size"`
DbPath string `yaml:"db_path"`
} `yaml:"transformation"`
// Drain is decoded from the YAML key "drain".
Drain struct {
Depth int `yaml:"depth"`
SimThreshold float64 `yaml:"sim_threshold"`
MaxChildren int `yaml:"max_children"`
MaxClusters int `yaml:"max_clusters"`
// MaskingPatterns is the list iterated by Compile and
// NumericPatternNames, in YAML order.
MaskingPatterns []MaskingPattern `yaml:"masking_patterns"`
} `yaml:"drain"`
// Detection is decoded from the YAML key "detector" (note that the key
// differs from the field name).
Detection DetectorConfig `yaml:"detector"`
// Output is decoded from the YAML key "output".
Output struct {
FeatureLogPath string `yaml:"feature_log_path"`
AnomalyLogPath string `yaml:"anomaly_log_path"`
} `yaml:"output"`
}
// LoadConfig opens the YAML file at path and decodes its first document into
// a freshly allocated Config.
//
// Unknown YAML keys are tolerated (KnownFields is explicitly disabled). The
// masking-pattern regular expressions are not compiled here; call
// Config.Compile on the result for that.
//
// A non-nil error is returned, wrapped with the offending path, when the file
// cannot be opened or its contents cannot be decoded; in that case the
// returned *Config is nil.
func LoadConfig(path string) (*Config, error) {
	file, openErr := os.Open(path)
	if openErr != nil {
		return nil, fmt.Errorf("config: open %q: %w", path, openErr)
	}
	defer file.Close()

	decoder := yaml.NewDecoder(file)
	decoder.KnownFields(false)

	parsed := new(Config)
	if decodeErr := decoder.Decode(parsed); decodeErr != nil {
		return nil, fmt.Errorf("config: decode %q: %w", path, decodeErr)
	}
	return parsed, nil
}
// Compile turns every Drain.MaskingPatterns[i].Pattern into a *regexp.Regexp
// and stores it in the matching entry's Re field.
//
// Patterns are processed in slice order. On the first pattern that fails to
// compile, Compile stops and returns an error that quotes the pattern's Name;
// entries before it keep their freshly compiled Re, entries from it onward
// are left untouched. It returns nil when every pattern compiles.
func (c *Config) Compile() error {
	// The local slice header shares its backing array with the config, so
	// writes through it update c.Drain.MaskingPatterns in place.
	patterns := c.Drain.MaskingPatterns
	for idx := 0; idx < len(patterns); idx++ {
		compiled, err := regexp.Compile(patterns[idx].Pattern)
		if err != nil {
			return fmt.Errorf("config: compile pattern %q: %w", patterns[idx].Name, err)
		}
		patterns[idx].Re = compiled
	}
	return nil
}
// NumericPatternNames lists, in configuration order, the Name of every
// masking pattern whose Type is "float" or "int".
//
// Patterns with an empty Name are skipped. The result is always a non-nil
// slice, empty when no pattern qualifies.
func (c *Config) NumericPatternNames() []string {
	patterns := c.Drain.MaskingPatterns
	numeric := make([]string, 0, len(patterns))
	for i := range patterns {
		p := &patterns[i]
		if p.Name == "" {
			continue
		}
		switch p.Type {
		case "float", "int":
			numeric = append(numeric, p.Name)
		}
	}
	return numeric
}