guenther/internal/config/config.go

203 lines
7.1 KiB
Go
Raw Permalink Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

// Package config provides the pipeline configuration loaded from YAML.
package config
import (
"fmt"
"os"
"regexp"
"time"
"gopkg.in/yaml.v3"
)
// MaskingPattern is a single entry in drain.masking_patterns.
//
// The first four fields are decoded from YAML. Re is runtime-only state that
// Config.Compile fills in from Pattern; it is never read from or written to YAML.
type MaskingPattern struct {
	// Name identifies the pattern. It is reported in Compile errors and is
	// the value returned by Config.NumericPatternNames.
	Name string `yaml:"name"`
	// Pattern is the regular-expression source handed to regexp.Compile.
	Pattern string `yaml:"pattern"`
	// Replace is read from the "replace" key.
	// NOTE(review): presumably the text substituted for a match — confirm against the consumer.
	Replace string `yaml:"replace"`
	// Type classifies the pattern. "float" and "int" mark it as numeric for
	// NumericPatternNames; other values are not interpreted in this file.
	Type string `yaml:"type"`
	// Re is the compiled form of Pattern, nil until Config.Compile succeeds.
	// It is excluded from YAML: without the "-" tag yaml.v3 would bind it to
	// the key "re", so a stray key could break decoding and marshalling would
	// emit the field.
	Re *regexp.Regexp `yaml:"-"`
}
// MADConfig defines parameters for the MAD detector.
// Both fields are plain values decoded from YAML.
// NOTE(review): the defaults quoted below are not applied by the loading code
// visible in this file; presumably the detector substitutes them — confirm.
type MADConfig struct {
// Threshold is the modified Z-score cutoff for IsAnomaly.
// Recommended: 3.0-4.0. Default: 3.5.
Threshold float64 `yaml:"threshold"`
// CalibrationSize is the number of NormalizedVectors to buffer before
// automatic per-feature median/MAD calibration runs.
// Default (if 0): 100.
CalibrationSize int `yaml:"calibration_size"`
}
// COPODConfig defines the parameters for the Copula-Based Outlier detector.
type COPODConfig struct {
// Threshold is read from the "threshold" key.
// NOTE(review): presumably the anomaly-score cutoff; its range is not visible here — confirm.
Threshold float64 `yaml:"threshold"`
// BufferSize is read from the "buffer_size" key.
// NOTE(review): presumably the number of samples the detector retains — confirm.
BufferSize int `yaml:"buffer_size"`
}
// RRCFConfig defines the parameters for the Robust Random Cut Forest detector.
// Used for the standalone RRCF detector and the classic AVG/MAX/MEDIAN ensemble paths.
// It has the same fields and YAML keys as RRCFVariantConfig.
type RRCFConfig struct {
// NumTrees is read from the "num_trees" key.
NumTrees int `yaml:"num_trees"`
// TreeSize is read from the "tree_size" key.
TreeSize int `yaml:"tree_size"`
// ThresholdPercentile is read from the "threshold_percentile" key.
// NOTE(review): whether this is expressed as 0-1 or 0-100 is not visible here — confirm.
ThresholdPercentile float64 `yaml:"threshold_percentile"`
}
// RRCFVariantConfig holds parameters for a single named RRCF variant inside
// the SEAD multi-horizon ensemble. RRCFVariantsConfig embeds three of these
// (fast, mid, slow).
type RRCFVariantConfig struct {
// NumTrees controls score stability: more trees → smoother, more conservative scores.
NumTrees int `yaml:"num_trees"`
// TreeSize sets the sliding-window capacity per tree.
TreeSize int `yaml:"tree_size"`
// ThresholdPercentile is the per-model decision threshold (standalone use).
// NOTE(review): whether this is expressed as 0-1 or 0-100 is not visible here — confirm.
ThresholdPercentile float64 `yaml:"threshold_percentile"`
}
// RRCFVariantsConfig groups the three RRCF variants used by the SEAD ensemble.
// Each variant captures anomalies at a different time-horizon:
//   - Fast: short memory, reactive to transient spikes
//   - Mid: medium memory, balanced sensitivity
//   - Slow: long memory, detects sustained / slow-drift events
type RRCFVariantsConfig struct {
// Fast is decoded from the "fast" key.
Fast RRCFVariantConfig `yaml:"fast"`
// Mid is decoded from the "mid" key.
Mid RRCFVariantConfig `yaml:"mid"`
// Slow is decoded from the "slow" key.
Slow RRCFVariantConfig `yaml:"slow"`
}
// SEADConfig holds tunable parameters for the SEAD ensemble.
// Only used when EnsembleConfig.Method == "sead".
// NOTE(review): the defaults quoted below are not applied by the loading code
// visible in this file; presumably the ensemble substitutes them — confirm.
type SEADConfig struct {
// Eta is the MWU learning rate η ∈ (0, 1].
// Higher values react faster to distribution shifts but are noisier.
// Recommended: 0.05-0.20. Default (if 0): 0.10.
Eta float64 `yaml:"eta"`
// Lambda is the KL-divergence regularisation strength.
// 0 = pure MWU (uniform prior). Recommended: 0.0-0.05. Default: 0.01.
Lambda float64 `yaml:"lambda"`
// QuantileWindow is the number of past scores retained per detector for
// streaming quantile normalisation. Default (if 0): 300.
QuantileWindow int `yaml:"quantile_window"`
// MinDataPoints is the cold-start guard: no anomaly is flagged until at
// least this many windows have been scored. Default (if 0): 20.
MinDataPoints int `yaml:"min_data_points"`
}
// EnsembleConfig manages the routing for the multi-model detector.
type EnsembleConfig struct {
// Enabled is read from the "enabled" key.
// NOTE(review): presumably switches the ensemble path on or off — confirm against the consumer.
Enabled bool `yaml:"enabled"`
// Method selects the score-aggregation strategy.
// Allowed values: "avg" (default), "max", "median", "sead".
// "sead": adaptive Multiplicative Weights Update ensemble (Shah et al., ICML 2025).
// The value is not validated in this file.
Method string `yaml:"method"`
// Contamination is the expected fraction of anomalous windows ∈ [0, 0.5).
// Determines the decision threshold as quantile(1-contamination) of
// the rolling combined score history.
Contamination float64 `yaml:"contamination"`
// SEAD tuning parameters (only applied when Method == "sead").
SEAD SEADConfig `yaml:"sead"`
}
// AutoScalingConfig holds thresholds and durations for dynamic detector switching.
// Durations are plain float64 values in seconds (per the field comments), not
// time.Duration. The level names Normal, High and Critical come from those
// comments; the switching logic itself is not in this file.
type AutoScalingConfig struct {
Enabled bool `yaml:"enabled"`
HighThreshold float64 `yaml:"high_threshold"` // e.g. 0.75 (Normal -> High)
CritThreshold float64 `yaml:"critical_threshold"` // e.g. 0.90 (High -> Critical)
HighDuration float64 `yaml:"high_duration"` // e.g. 30.0 (seconds)
CritDuration float64 `yaml:"critical_duration"` // e.g. 15.0 (seconds)
DownThreshold float64 `yaml:"down_threshold"` // e.g. 0.50 (back to Normal)
DownDuration float64 `yaml:"down_duration"` // e.g. 60.0 (seconds)
}
// DetectorConfig groups all anomaly detection configurations.
// It is decoded from the top-level "detector" key of the YAML file.
type DetectorConfig struct {
// Method is read from the "method" key.
// NOTE(review): presumably names the active detector; accepted values are
// not visible in this file — confirm.
Method string `yaml:"method"`
// Ensemble configures the multi-model ensemble.
Ensemble EnsembleConfig `yaml:"ensemble"`
// MAD configures the MAD detector.
MAD MADConfig `yaml:"mad"`
// COPOD configures the Copula-Based Outlier detector.
COPOD COPODConfig `yaml:"copod"`
// RRCF is used by the standalone detector and the AVG/MAX/MEDIAN ensemble paths.
RRCF RRCFConfig `yaml:"rrcf"`
// RRCFVariants configures the three-horizon RRCF instances for the SEAD ensemble.
// Defaults are applied automatically when fields are zero.
// NOTE(review): that defaulting is not performed by the code visible in this file — confirm where it happens.
RRCFVariants RRCFVariantsConfig `yaml:"rrcf_variants"`
// AutoScaling configures dynamic detector switching.
AutoScaling AutoScalingConfig `yaml:"auto_scaling"`
}
// Config is the top-level pipeline configuration.
// LoadConfig decodes it from YAML; Compile must then be called to populate
// the compiled regexps in Drain.MaskingPatterns.
type Config struct {
// Ingestion lists the data sources, decoded from the "ingestion" key.
Ingestion struct {
LogPath string `yaml:"log_path"`
NetInterface string `yaml:"net_interface"`
DiskDevice string `yaml:"disk_device"`
SystemctlServices []string `yaml:"systemctl_services"`
} `yaml:"ingestion"`
// Transformation is decoded from the "transformation" key.
Transformation struct {
// WindowSize is a time.Duration; yaml.v3 accepts duration strings such as "30s".
WindowSize time.Duration `yaml:"window_size"`
DbPath string `yaml:"db_path"`
} `yaml:"transformation"`
// Drain is decoded from the "drain" key.
// NOTE(review): presumably parameters of a Drain log-template parser — confirm.
Drain struct {
Depth int `yaml:"depth"`
SimThreshold float64 `yaml:"sim_threshold"`
MaxChildren int `yaml:"max_children"`
MaxClusters int `yaml:"max_clusters"`
// MaskingPatterns are compiled by Compile and filtered by NumericPatternNames.
MaskingPatterns []MaskingPattern `yaml:"masking_patterns"`
} `yaml:"drain"`
// Detection holds the detector settings. Note the YAML key is "detector",
// not "detection".
Detection DetectorConfig `yaml:"detector"`
// Output holds the log file paths, decoded from the "output" key.
Output struct {
FeatureLogPath string `yaml:"feature_log_path"`
AnomalyLogPath string `yaml:"anomaly_log_path"`
} `yaml:"output"`
}
// LoadConfig opens the YAML file at path and decodes its first document into
// a freshly allocated Config.
//
// Keys that have no matching struct field are not rejected, because
// KnownFields is left disabled. Masking patterns are not compiled here; call
// Config.Compile on the result.
//
// On failure the returned *Config is nil and the error wraps the underlying
// cause: "config: open ..." when the file cannot be opened, "config: decode ..."
// when decoding fails.
func LoadConfig(path string) (*Config, error) {
	file, openErr := os.Open(path)
	if openErr != nil {
		return nil, fmt.Errorf("config: open %q: %w", path, openErr)
	}
	defer file.Close()

	decoder := yaml.NewDecoder(file)
	decoder.KnownFields(false)

	loaded := new(Config)
	if decodeErr := decoder.Decode(loaded); decodeErr != nil {
		return nil, fmt.Errorf("config: decode %q: %w", path, decodeErr)
	}
	return loaded, nil
}
// Compile turns every MaskingPattern.Pattern in c.Drain.MaskingPatterns into
// a *regexp.Regexp and stores it in that entry's Re field.
//
// It stops at the first pattern that fails to compile and returns an error
// naming that entry and wrapping the regexp error. Entries before the failing
// one keep their newly compiled Re.
func (c *Config) Compile() error {
	patterns := c.Drain.MaskingPatterns
	for idx := 0; idx < len(patterns); idx++ {
		compiled, compileErr := regexp.Compile(patterns[idx].Pattern)
		if compileErr != nil {
			return fmt.Errorf("config: compile pattern %q: %w", patterns[idx].Name, compileErr)
		}
		// patterns shares its backing array with the config slice, so this
		// assignment updates the stored entry.
		patterns[idx].Re = compiled
	}
	return nil
}
// NumericPatternNames returns, in configuration order, the names of the
// masking patterns whose Type is "float" or "int". Entries with an empty
// Name are skipped. The result is never nil; it is empty when nothing matches.
func (c *Config) NumericPatternNames() []string {
	patterns := c.Drain.MaskingPatterns
	result := make([]string, 0, len(patterns))
	for i := range patterns {
		entry := &patterns[i]
		if entry.Name == "" {
			continue
		}
		switch entry.Type {
		case "float", "int":
			result = append(result, entry.Name)
		}
	}
	return result
}