// Package config provides the pipeline configuration loaded from YAML.
package config
|
||
|
||
import (
|
||
"fmt"
|
||
"os"
|
||
"regexp"
|
||
"time"
|
||
|
||
"gopkg.in/yaml.v3"
|
||
)
|
||
|
||
// MaskingPattern is a single entry in drain.masking_patterns.
//
// Name, Pattern, Replace and Type are decoded from YAML. Re is not part of the
// YAML schema: it stays nil until (*Config).Compile fills it in.
type MaskingPattern struct {
	// Name identifies the pattern. Compile quotes it in its error message and
	// NumericPatternNames returns it for numeric patterns.
	Name string `yaml:"name"`
	// Pattern is the regular-expression source handed to regexp.Compile.
	Pattern string `yaml:"pattern"`
	// Replace is decoded from the "replace" key.
	// NOTE(review): presumably the token substituted for a match; confirm
	// against the masking code, which is not in this file.
	Replace string `yaml:"replace"`
	// Type is decoded from the "type" key. The values "float" and "int" mark
	// the pattern as numeric for NumericPatternNames.
	Type string `yaml:"type"`
	// Re is the compiled form of Pattern, assigned by (*Config).Compile.
	// The "-" tag excludes this runtime-only field from YAML decoding and
	// encoding; without it yaml.v3 would map the field to the key "re".
	Re *regexp.Regexp `yaml:"-"`
}
|
||
|
||
// MADConfig defines parameters for the MAD detector.
//
// NOTE(review): the defaults quoted below are not applied anywhere in this
// file; confirm that the detector substitutes them for zero values.
type MADConfig struct {
	// Threshold is the modified Z-score cutoff for IsAnomaly.
	// Recommended: 3.0–4.0. Default: 3.5.
	Threshold float64 `yaml:"threshold"`
	// CalibrationSize is the number of NormalizedVectors to buffer before
	// automatic per-feature median/MAD calibration runs.
	// Default (if 0): 100.
	CalibrationSize int `yaml:"calibration_size"`
}
|
||
|
||
// COPODConfig defines the parameters for the Copula-Based Outlier detector.
type COPODConfig struct {
	// Threshold is decoded from the "threshold" key.
	// NOTE(review): presumably the anomaly-score cutoff; confirm with the detector.
	Threshold float64 `yaml:"threshold"`
	// BufferSize is decoded from the "buffer_size" key.
	// NOTE(review): presumably the number of samples retained; confirm with the detector.
	BufferSize int `yaml:"buffer_size"`
}
|
||
|
||
// RRCFConfig defines the parameters for the Robust Random Cut Forest detector.
// Used for the standalone RRCF detector and the classic AVG/MAX/MEDIAN ensemble paths.
//
// The three fields are decoded from the "num_trees", "tree_size" and
// "threshold_percentile" keys respectively.
type RRCFConfig struct {
	NumTrees            int     `yaml:"num_trees"`
	TreeSize            int     `yaml:"tree_size"`
	ThresholdPercentile float64 `yaml:"threshold_percentile"`
}
|
||
|
||
// RRCFVariantConfig holds parameters for a single named RRCF variant inside
// the SEAD multi-horizon ensemble.
type RRCFVariantConfig struct {
	// NumTrees controls score stability: more trees → smoother/conservative.
	NumTrees int `yaml:"num_trees"`
	// TreeSize sets the sliding-window capacity per tree.
	TreeSize int `yaml:"tree_size"`
	// ThresholdPercentile is the per-model decision threshold (standalone use).
	ThresholdPercentile float64 `yaml:"threshold_percentile"`
}
|
||
|
||
// RRCFVariantsConfig groups the three RRCF variants used by the SEAD ensemble.
|
||
// Each variant captures anomalies at a different time-horizon:
|
||
// - Fast: short memory, reactive to transient spikes
|
||
// - Mid: medium memory, balanced sensitivity
|
||
// - Slow: long memory, detects sustained / slow-drift events
|
||
type RRCFVariantsConfig struct {
|
||
Fast RRCFVariantConfig `yaml:"fast"`
|
||
Mid RRCFVariantConfig `yaml:"mid"`
|
||
Slow RRCFVariantConfig `yaml:"slow"`
|
||
}
|
||
|
||
// SEADConfig holds tunable parameters for the SEAD ensemble.
// Only used when EnsembleConfig.Method == "sead".
//
// NOTE(review): the defaults quoted below are not applied anywhere in this
// file; confirm that the ensemble substitutes them for zero values.
type SEADConfig struct {
	// Eta is the MWU learning rate η ∈ (0, 1].
	// Higher values react faster to distribution shifts but are noisier.
	// Recommended: 0.05–0.20. Default (if 0): 0.10.
	Eta float64 `yaml:"eta"`

	// Lambda is the KL-divergence regularisation strength.
	// 0 = pure MWU (uniform prior). Recommended: 0.0–0.05. Default: 0.01.
	Lambda float64 `yaml:"lambda"`

	// QuantileWindow is the number of past scores retained per detector for
	// streaming quantile normalisation. Default (if 0): 300.
	QuantileWindow int `yaml:"quantile_window"`

	// MinDataPoints is the cold-start guard: no anomaly is flagged until at
	// least this many windows have been scored. Default (if 0): 20.
	MinDataPoints int `yaml:"min_data_points"`
}
|
||
|
||
// EnsembleConfig manages the routing for the multi-model detector.
|
||
type EnsembleConfig struct {
|
||
Enabled bool `yaml:"enabled"`
|
||
|
||
// Method selects the score-aggregation strategy.
|
||
// Allowed values: "avg" (default), "max", "median", "sead".
|
||
// "sead": adaptive Multiplicative Weights Update ensemble (Shah et al., ICML 2025).
|
||
Method string `yaml:"method"`
|
||
|
||
// Contamination is the expected fraction of anomalous windows ∈ [0, 0.5).
|
||
// Determines the decision threshold as quantile(1-contamination) of
|
||
// the rolling combined score history.
|
||
Contamination float64 `yaml:"contamination"`
|
||
|
||
// SEAD tuning parameters (only applied when Method == "sead").
|
||
SEAD SEADConfig `yaml:"sead"`
|
||
}
|
||
|
||
// AutoScalingConfig holds thresholds and durations for dynamic detector switching.
//
// The example values below are illustrative; this file applies no defaults.
type AutoScalingConfig struct {
	// Enabled is decoded from the "enabled" key.
	Enabled bool `yaml:"enabled"`
	// HighThreshold, e.g. 0.75 (Normal -> High).
	HighThreshold float64 `yaml:"high_threshold"`
	// CritThreshold, e.g. 0.90 (High -> Critical).
	CritThreshold float64 `yaml:"critical_threshold"`
	// HighDuration, e.g. 30.0 (seconds).
	HighDuration float64 `yaml:"high_duration"`
	// CritDuration, e.g. 15.0 (seconds).
	CritDuration float64 `yaml:"critical_duration"`
	// DownThreshold, e.g. 0.50 (back to Normal).
	DownThreshold float64 `yaml:"down_threshold"`
	// DownDuration, e.g. 60.0 (seconds).
	DownDuration float64 `yaml:"down_duration"`
}
|
||
|
||
// DetectorConfig groups all anomaly detection configurations.
|
||
type DetectorConfig struct {
|
||
Method string `yaml:"method"`
|
||
Ensemble EnsembleConfig `yaml:"ensemble"`
|
||
MAD MADConfig `yaml:"mad"`
|
||
COPOD COPODConfig `yaml:"copod"`
|
||
// RRCF is used by the standalone detector and the AVG/MAX/MEDIAN ensemble paths.
|
||
RRCF RRCFConfig `yaml:"rrcf"`
|
||
// RRCFVariants configures the three-horizon RRCF instances for the SEAD ensemble.
|
||
// Defaults are applied automatically when fields are zero.
|
||
RRCFVariants RRCFVariantsConfig `yaml:"rrcf_variants"`
|
||
AutoScaling AutoScalingConfig `yaml:"auto_scaling"`
|
||
}
|
||
|
||
// Config is the top-level pipeline configuration.
|
||
type Config struct {
|
||
Ingestion struct {
|
||
LogPath string `yaml:"log_path"`
|
||
NetInterface string `yaml:"net_interface"`
|
||
DiskDevice string `yaml:"disk_device"`
|
||
SystemctlServices []string `yaml:"systemctl_services"`
|
||
} `yaml:"ingestion"`
|
||
|
||
Transformation struct {
|
||
WindowSize time.Duration `yaml:"window_size"`
|
||
DbPath string `yaml:"db_path"`
|
||
} `yaml:"transformation"`
|
||
|
||
Drain struct {
|
||
Depth int `yaml:"depth"`
|
||
SimThreshold float64 `yaml:"sim_threshold"`
|
||
MaxChildren int `yaml:"max_children"`
|
||
MaxClusters int `yaml:"max_clusters"`
|
||
MaskingPatterns []MaskingPattern `yaml:"masking_patterns"`
|
||
} `yaml:"drain"`
|
||
|
||
Detection DetectorConfig `yaml:"detector"`
|
||
|
||
Output struct {
|
||
FeatureLogPath string `yaml:"feature_log_path"`
|
||
AnomalyLogPath string `yaml:"anomaly_log_path"`
|
||
} `yaml:"output"`
|
||
}
|
||
|
||
// LoadConfig reads and decodes the YAML file at path.
|
||
func LoadConfig(path string) (*Config, error) {
|
||
f, err := os.Open(path)
|
||
if err != nil {
|
||
return nil, fmt.Errorf("config: open %q: %w", path, err)
|
||
}
|
||
defer f.Close()
|
||
|
||
var cfg Config
|
||
dec := yaml.NewDecoder(f)
|
||
dec.KnownFields(false)
|
||
if err := dec.Decode(&cfg); err != nil {
|
||
return nil, fmt.Errorf("config: decode %q: %w", path, err)
|
||
}
|
||
return &cfg, nil
|
||
}
|
||
|
||
// Compile compiles all MaskingPattern.Pattern strings into *regexp.Regexp.
|
||
func (c *Config) Compile() error {
|
||
for i := range c.Drain.MaskingPatterns {
|
||
mp := &c.Drain.MaskingPatterns[i]
|
||
re, err := regexp.Compile(mp.Pattern)
|
||
if err != nil {
|
||
return fmt.Errorf("config: compile pattern %q: %w", mp.Name, err)
|
||
}
|
||
mp.Re = re
|
||
}
|
||
return nil
|
||
}
|
||
|
||
// NumericPatternNames returns the ordered list of MaskingPattern names whose
|
||
// Type is "float" or "int".
|
||
func (c *Config) NumericPatternNames() []string {
|
||
names := make([]string, 0, len(c.Drain.MaskingPatterns))
|
||
for _, mp := range c.Drain.MaskingPatterns {
|
||
if mp.Name != "" && (mp.Type == "float" || mp.Type == "int") {
|
||
names = append(names, mp.Name)
|
||
}
|
||
}
|
||
return names
|
||
}
|