254 lines
8.4 KiB
Go
254 lines
8.4 KiB
Go
// Package detect provides anomaly detection algorithms and ensemble logic.
|
||
package detect
|
||
|
||
import (
|
||
"log"
|
||
"math"
|
||
"sort"
|
||
"sync"
|
||
|
||
"codeberg.org/pata1704/guenther/pkg/types"
|
||
)
|
||
|
||
// MADDetector is an anomaly detector that scores a feature vector by its
// largest per-feature modified Z-score, using the Median Absolute Deviation
// (MAD) as a robust measure of spread.
//
// # Where the statistics come from
//
//   - NewMADDetector: medians and MADs are supplied by the caller.
//   - NewMADDetectorAutoCalibrate: medians and MADs are derived from the
//     first calibrationSize non-empty NormalizedVectors passed to Score.
//   - Fit: medians and MADs are recomputed from a batch of vectors and
//     replace whatever the detector held before.
//
// Example:
//
//	detector := NewMADDetectorAutoCalibrate(3.5, 100)
//
// While auto-calibration is still collecting vectors, Score does not return
// zero: it delegates to scoreIdentity, which scores against a fixed identity
// prior and labels its results "MAD (warmup)".
//
// NOTE(review): earlier documentation said the SEAD ensemble down-weights MAD
// during warmup because all warmup scores are zero. The code in this file
// emits identity-prior scores instead; confirm how SEAD is expected to treat
// them.
//
// # Calibration contract
//
// The medians and mads slices must be computed from the SAME representation
// that arrives in vector.NormalizedVector, i.e. from the RobustScaler-scaled
// feature vectors, NOT from raw window aggregates.
//
// # Scoring
//
// For each feature i the modified Z-score is:
//
//	score_i = |x_i - median_i| / (1.4826 * MAD_i)
//
// MAD_i is floored at 0.01 when scoring. The constant 1.4826 ≈ 1/Φ⁻¹(3/4)
// makes MAD a consistent estimator of σ under normality (Rousseeuw & Croux,
// 1993). The anomaly score is the maximum modified Z-score across all
// features.
//
// # Fit / Update
//
// Fit replaces the current statistics with values derived from the supplied
// vectors and ends any warmup in progress. Update forwards its vector to
// Score, so it only has an effect while auto-calibration is collecting data.
type MADDetector struct {
	// mu guards medians, mads and the auto-calibration fields.
	mu sync.Mutex

	// threshold is the modified Z-score above which a result is flagged.
	threshold float64

	// Baseline statistics of NormalizedVector, one entry per feature.
	medians []float64
	mads    []float64

	// Auto-calibration state. calibrationSize == 0 means disabled.
	calibrationSize int
	calibrationBuf  [][]float64 // copies of NormalizedVectors seen during warmup
	calibrated      bool        // true once medians/mads are usable
}
|
||
|
||
// NewMADDetector creates a MADDetector with pre-calibrated baseline statistics.
|
||
//
|
||
// - threshold: anomaly score cutoff (modified Z-score). Typical: 2.5–4.0.
|
||
// - medians: per-feature median computed from NormalizedVector in baseline.
|
||
// - mads: per-feature MAD computed from NormalizedVector in baseline.
|
||
// Zero entries are replaced with 1.0 to avoid division-by-zero.
|
||
//
|
||
// Pass nil for medians and mads only when calibrationSize > 0 is set via
|
||
// NewMADDetectorAutoCalibrate; otherwise all scores will be zero.
|
||
func NewMADDetector(threshold float64, medians, mads []float64) *MADDetector {
|
||
return &MADDetector{
|
||
threshold: threshold,
|
||
medians: medians,
|
||
mads: mads,
|
||
calibrated: len(medians) > 0,
|
||
}
|
||
}
|
||
|
||
// NewMADDetectorAutoCalibrate creates a MADDetector that derives its own
|
||
// per-feature statistics from the first calibrationSize NormalizedVectors
|
||
// it encounters in Score.
|
||
//
|
||
// - threshold: modified Z-score cutoff after calibration. Typical: 3.5.
|
||
// - calibrationSize: number of vectors to buffer before first calibration.
|
||
// Recommended: 60–200
|
||
func NewMADDetectorAutoCalibrate(threshold float64, calibrationSize int) *MADDetector {
|
||
if calibrationSize <= 0 {
|
||
calibrationSize = 100
|
||
}
|
||
// Initialise with "Identity" stats (median=0, mad=1) so the detector is
|
||
// operational immediately with a global sensitivity of 1.0 (baseline IQR).
|
||
// Features are already RobustScaled by DuckDB, so this is a sane prior.
|
||
// Automatic calibration will refine these once the buffer is full.
|
||
return &MADDetector{
|
||
threshold: threshold,
|
||
calibrationSize: calibrationSize,
|
||
medians: nil, // will be Lazy-init or from buffer
|
||
mads: nil,
|
||
}
|
||
}
|
||
|
||
// Fit recomputes per-feature median and MAD from the supplied vectors,
|
||
// replacing any prior calibration. Safe to call concurrently with Score.
|
||
func (m *MADDetector) Fit(vectors []types.FeatureVector) error {
|
||
if len(vectors) == 0 {
|
||
return nil
|
||
}
|
||
raw := make([][]float64, len(vectors))
|
||
for i, v := range vectors {
|
||
raw[i] = v.NormalizedVector
|
||
}
|
||
medians, mads := computeMADStats(raw)
|
||
|
||
m.mu.Lock()
|
||
m.medians = medians
|
||
m.mads = mads
|
||
m.calibrated = true
|
||
m.calibrationBuf = nil
|
||
m.mu.Unlock()
|
||
|
||
log.Printf("mad: fitted on %d vectors (%d features)", len(vectors), len(medians))
|
||
return nil
|
||
}
|
||
|
||
// Update is a no-op when manual statistics are used. When auto-calibration is
|
||
// active it is equivalent to calling Score but discards the result.
|
||
func (m *MADDetector) Update(v types.FeatureVector) error {
|
||
_, _ = m.Score(v)
|
||
return nil
|
||
}
|
||
|
||
// Score computes the maximum modified Z-score across all features of vector.
|
||
//
|
||
// During the auto-calibration warmup the vector is buffered and a zero-score
|
||
// result is returned. Once the calibration buffer is full the statistics are
|
||
// derived automatically and scoring starts on the next call.
|
||
//
|
||
// vector.NormalizedVector must contain values on the same scale as the
|
||
// medians and mads slices (i.e. RobustScaler-scaled values from DuckDB).
|
||
func (m *MADDetector) Score(vector types.FeatureVector) (types.AnomalyResult, error) {
|
||
m.mu.Lock()
|
||
// ── Auto-calibration warmup ───────────────────────────────────────────
|
||
if !m.calibrated && m.calibrationSize > 0 {
|
||
if vec := vector.NormalizedVector; len(vec) > 0 {
|
||
cp := make([]float64, len(vec))
|
||
copy(cp, vec)
|
||
m.calibrationBuf = append(m.calibrationBuf, cp)
|
||
}
|
||
if len(m.calibrationBuf) >= m.calibrationSize {
|
||
m.medians, m.mads = computeMADStats(m.calibrationBuf)
|
||
m.calibrated = true
|
||
m.calibrationBuf = nil
|
||
log.Printf("mad: auto-calibrated on %d vectors (%d features)",
|
||
m.calibrationSize, len(m.medians))
|
||
}
|
||
if !m.calibrated {
|
||
m.mu.Unlock()
|
||
return m.scoreIdentity(vector), nil
|
||
}
|
||
}
|
||
medians := m.medians
|
||
mads := m.mads
|
||
m.mu.Unlock()
|
||
|
||
// ── Scoring ───────────────────────────────────────────────────────────
|
||
maxScore := 0.0
|
||
for i, val := range vector.NormalizedVector {
|
||
if i >= len(medians) || i >= len(mads) {
|
||
break
|
||
}
|
||
// Stability floor: prevent explosive Z-scores for features with near-zero variance.
|
||
// 1e-2 corresponds to 1% of the original baseline IQR.
|
||
mad := math.Max(mads[i], 0.01)
|
||
|
||
// 1.4826 converts MAD to an estimator of standard deviation.
|
||
score := math.Abs(val-medians[i]) / (1.4826 * mad)
|
||
if score > maxScore {
|
||
maxScore = score
|
||
}
|
||
}
|
||
|
||
return types.AnomalyResult{
|
||
Timestamp: vector.Timestamp,
|
||
Score: maxScore,
|
||
IsAnomaly: maxScore > m.threshold,
|
||
Confidence: math.Min(maxScore/math.Max(m.threshold, 1e-9), 1.0),
|
||
Method: "MAD",
|
||
}, nil
|
||
}
|
||
|
||
// scoreIdentity provides a sane fallback (median=0, mad=1) for pre-scaled data.
|
||
func (m *MADDetector) scoreIdentity(vector types.FeatureVector) types.AnomalyResult {
|
||
maxScore := 0.0
|
||
for _, val := range vector.NormalizedVector {
|
||
score := math.Abs(val) / 0.6745 // 1/1.4826
|
||
if score > maxScore {
|
||
maxScore = score
|
||
}
|
||
}
|
||
res := types.AnomalyResult{
|
||
Timestamp: vector.Timestamp,
|
||
Score: maxScore,
|
||
IsAnomaly: maxScore > m.threshold,
|
||
Confidence: math.Min(maxScore/math.Max(m.threshold, 1e-9), 1.0),
|
||
Method: "MAD (warmup)",
|
||
}
|
||
if res.IsAnomaly {
|
||
res.Details = "Detected during MAD auto-calibration warmup period (using identity prior)."
|
||
}
|
||
return res
|
||
}
|
||
|
||
// ── calibration helper ────────────────────────────────────────────────────────
|
||
|
||
// computeMADStats returns per-feature median and MAD for a matrix of row vectors.
|
||
// Both slices have length equal to the number of features (columns).
|
||
func computeMADStats(rows [][]float64) (medians, mads []float64) {
|
||
if len(rows) == 0 {
|
||
return nil, nil
|
||
}
|
||
nFeatures := len(rows[0])
|
||
medians = make([]float64, nFeatures)
|
||
mads = make([]float64, nFeatures)
|
||
|
||
col := make([]float64, len(rows))
|
||
devs := make([]float64, len(rows))
|
||
for f := range nFeatures {
|
||
for r, row := range rows {
|
||
if f < len(row) {
|
||
col[r] = row[f]
|
||
}
|
||
}
|
||
med := median(col)
|
||
medians[f] = med
|
||
for r, v := range col {
|
||
devs[r] = math.Abs(v - med)
|
||
}
|
||
mads[f] = median(devs)
|
||
}
|
||
return medians, mads
|
||
}
|
||
|
||
// median sorts xs in place (ascending) and returns its middle value, or the
// mean of the two middle values when the length is even. An empty slice
// yields 0.
func median(xs []float64) float64 {
	if len(xs) == 0 {
		return 0
	}
	sort.Float64s(xs)

	mid := len(xs) / 2
	if len(xs)%2 != 0 {
		return xs[mid]
	}
	lower, upper := xs[mid-1], xs[mid]
	return (lower + upper) / 2.0
}
|