guenther/internal/detect/mad.go

254 lines
8.4 KiB
Go
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

// Package detect provides anomaly detection algorithms and ensemble logic.
package detect
import (
"log"
"math"
"sort"
"sync"
"codeberg.org/pata1704/guenther/pkg/types"
)
// MADDetector scores feature vectors using per-feature Median Absolute
// Deviation (MAD) with pre-calibrated or automatically derived statistics.
//
// # Auto-calibration
//
// A detector created with NewMADDetectorAutoCalibrate starts without
// statistics (medians and mads are nil, calibrationSize > 0). Score buffers
// the first calibrationSize non-empty NormalizedVectors, computes per-feature
// statistics once the buffer is full, and scores against them from then on.
// During the warmup phase Score does not return a zero score: it returns the
// provisional result produced by scoreIdentity, tagged Method "MAD (warmup)".
//
// detector := NewMADDetectorAutoCalibrate(3.5, 100)
//
// NOTE(review): this comment previously stated that SEAD down-weights MAD
// during the warmup phase because all warmup scores are zero. Warmup scores
// are not zero in the code below, so confirm how SEAD treats
// "MAD (warmup)" results before relying on that behaviour.
//
// # Calibration contract
//
// The medians and mads slices must be computed from the SAME representation
// that arrives in vector.NormalizedVector, i.e. from the RobustScaler-scaled
// feature vectors, NOT from raw window aggregates.
//
// # Scoring
//
// For each feature i the modified Z-score is:
//
// score_i = |x_i - median_i| / (1.4826 * max(MAD_i, 0.01))
//
// The constant 1.4826 ≈ 1/(Φ⁻¹(3/4)) makes MAD a consistent estimator of σ
// under normality (Rousseeuw & Croux, 1993). The 0.01 floor keeps features
// with near-zero spread from producing explosive scores. The anomaly score is
// the maximum modified Z-score across all features.
//
// # Fit / Update
//
// Fit replaces the current statistics with values derived from the supplied
// vectors, marks the detector as calibrated and drops any warmup buffer.
// Update calls Score and discards the result, so it only changes state while
// auto-calibration is still buffering vectors.
type MADDetector struct {
// mu is held by Fit and Score while they read or write medians, mads,
// calibrationBuf and calibrated.
mu sync.Mutex
// threshold is the anomaly cutoff on the maximum modified Z-score. In the
// code shown here it is only written by the constructors and is read
// without taking mu.
threshold float64
medians []float64 // per-feature median of NormalizedVector in baseline
mads []float64 // per-feature MAD of NormalizedVector in baseline
// Auto-calibration state. calibrationSize == 0 means disabled.
calibrationSize int
calibrationBuf [][]float64 // copies of the NormalizedVectors collected during warmup
calibrated bool // true once medians/mads hold usable statistics
}
// NewMADDetector creates a MADDetector with pre-calibrated baseline statistics.
//
//   - threshold: anomaly score cutoff (modified Z-score). Typical: 2.5–4.0.
//   - medians: per-feature median computed from NormalizedVector in baseline.
//   - mads: per-feature MAD computed from NormalizedVector in baseline.
//     Entries are stored as given; Score floors each MAD at 0.01 when it
//     scores, which also protects against division by zero.
//
// Both slices are copied, so the caller may keep modifying its own slices
// without affecting, or racing with, the detector.
//
// The detector counts as calibrated only when medians is non-empty. With nil
// or empty medians Score has no baseline to compare against and every score
// is zero; use NewMADDetectorAutoCalibrate when the statistics should be
// derived from the incoming data instead.
func NewMADDetector(threshold float64, medians, mads []float64) *MADDetector {
	return &MADDetector{
		threshold: threshold,
		// Detach from the caller's backing arrays: Score reads these slices
		// outside the mutex, so shared storage would allow a data race.
		medians:    append([]float64(nil), medians...),
		mads:       append([]float64(nil), mads...),
		calibrated: len(medians) > 0,
	}
}
// NewMADDetectorAutoCalibrate builds a MADDetector that learns its
// per-feature median and MAD from the first calibrationSize non-empty
// NormalizedVectors passed to Score.
//
//   - threshold: modified Z-score cutoff. Typical: 3.5.
//   - calibrationSize: number of vectors to buffer before the statistics are
//     derived. Recommended: 60–200. Values <= 0 fall back to 100.
//
// No statistics are installed up front: medians and mads stay nil until the
// buffer is full. Until then Score delegates to scoreIdentity, so the detector
// still produces results while it is warming up.
func NewMADDetectorAutoCalibrate(threshold float64, calibrationSize int) *MADDetector {
	const defaultCalibrationSize = 100

	size := calibrationSize
	if size <= 0 {
		size = defaultCalibrationSize
	}

	d := &MADDetector{threshold: threshold}
	d.calibrationSize = size // medians and mads remain nil until calibration
	return d
}
// Fit recomputes per-feature median and MAD from the supplied vectors,
// replacing any prior calibration and discarding any warmup buffer. Safe to
// call concurrently with Score.
//
// Vectors whose NormalizedVector is empty carry no samples and are skipped,
// mirroring what Score does while it buffers vectors for auto-calibration.
// If no usable vector remains (including when vectors itself is empty) Fit
// leaves the detector untouched rather than replacing a working calibration
// with empty statistics, which would silently force every score to zero.
//
// Fit currently always returns nil.
func (m *MADDetector) Fit(vectors []types.FeatureVector) error {
	rows := make([][]float64, 0, len(vectors))
	for i := range vectors {
		if nv := vectors[i].NormalizedVector; len(nv) > 0 {
			rows = append(rows, nv)
		}
	}
	if len(rows) == 0 {
		if len(vectors) > 0 {
			log.Printf("mad: fit skipped, none of %d vectors has features", len(vectors))
		}
		return nil
	}

	// Derive the statistics before taking the lock so concurrent Score calls
	// are not blocked while the columns are sorted.
	medians, mads := computeMADStats(rows)

	m.mu.Lock()
	m.medians = medians
	m.mads = mads
	m.calibrated = true
	m.calibrationBuf = nil // any partially collected warmup sample is obsolete
	m.mu.Unlock()

	log.Printf("mad: fitted on %d vectors (%d features)", len(rows), len(medians))
	return nil
}
// Update feeds v through Score and discards the result.
//
// With pre-calibrated statistics, or once auto-calibration has completed,
// this changes no detector state. During the auto-calibration warmup it adds
// v to the calibration buffer exactly as a call to Score would. An error
// reported by Score is returned to the caller instead of being dropped.
func (m *MADDetector) Update(v types.FeatureVector) error {
	_, err := m.Score(v)
	return err
}
// Score returns the largest modified Z-score found among the features of
// vector.
//
// While auto-calibration is warming up, every non-empty NormalizedVector is
// copied into the calibration buffer and the result comes from scoreIdentity.
// The call that fills the buffer derives the per-feature statistics, drops
// the buffer, and is itself scored against the freshly derived statistics.
//
// vector.NormalizedVector must contain values on the same scale as the
// medians and mads slices (i.e. RobustScaler-scaled values from DuckDB).
// Features without a matching median and MAD entry are ignored. The returned
// error is always nil.
func (m *MADDetector) Score(vector types.FeatureVector) (types.AnomalyResult, error) {
	features := vector.NormalizedVector

	m.mu.Lock()
	// ── Auto-calibration warmup ───────────────────────────────────────────
	if m.calibrationSize > 0 && !m.calibrated {
		if len(features) > 0 {
			// Keep a private copy so later changes by the caller cannot
			// alter the calibration sample.
			sample := make([]float64, len(features))
			copy(sample, features)
			m.calibrationBuf = append(m.calibrationBuf, sample)
		}
		if len(m.calibrationBuf) < m.calibrationSize {
			// Still collecting: answer with the warmup fallback.
			m.mu.Unlock()
			return m.scoreIdentity(vector), nil
		}
		// Buffer is full: derive the statistics, then score this vector below.
		m.medians, m.mads = computeMADStats(m.calibrationBuf)
		m.calibrated = true
		m.calibrationBuf = nil
		log.Printf("mad: auto-calibrated on %d vectors (%d features)",
			m.calibrationSize, len(m.medians))
	}
	// Snapshot the statistics so the arithmetic can run without the lock.
	baselineMedians, baselineMADs := m.medians, m.mads
	m.mu.Unlock()

	// ── Scoring ───────────────────────────────────────────────────────────
	// Only features that have both a median and a MAD entry are scored.
	scored := len(features)
	if len(baselineMedians) < scored {
		scored = len(baselineMedians)
	}
	if len(baselineMADs) < scored {
		scored = len(baselineMADs)
	}

	peak := 0.0
	for i := 0; i < scored; i++ {
		// Stability floor: a near-zero MAD would otherwise blow the score
		// up. 1e-2 corresponds to 1% of the original baseline IQR.
		spread := math.Max(baselineMADs[i], 0.01)
		// 1.4826 turns the MAD into an estimate of the standard deviation.
		z := math.Abs(features[i]-baselineMedians[i]) / (1.4826 * spread)
		if z > peak {
			peak = z
		}
	}

	result := types.AnomalyResult{
		Timestamp:  vector.Timestamp,
		Score:      peak,
		IsAnomaly:  peak > m.threshold,
		Confidence: math.Min(peak/math.Max(m.threshold, 1e-9), 1.0),
		Method:     "MAD",
	}
	return result, nil
}
// scoreIdentity scores vector against the identity prior (median = 0,
// MAD = 1), the fallback used for pre-scaled data while auto-calibration is
// still collecting its sample.
//
// Each feature contributes |x| / 1.4826, which is exactly what Score computes
// for median = 0 and MAD = 1, and the maximum over all features is reported.
// The result is tagged Method "MAD (warmup)"; when it crosses the threshold,
// Details explains that it was produced during warmup.
func (m *MADDetector) scoreIdentity(vector types.FeatureVector) types.AnomalyResult {
	// Same consistency constant Score uses to turn a MAD into a σ estimate.
	const madToSigma = 1.4826

	maxScore := 0.0
	for _, val := range vector.NormalizedVector {
		// Divide by 1.4826, not by its reciprocal 0.6745: dividing by 0.6745
		// multiplies by 1.4826 and makes warmup scores about 2.2x larger
		// than the identity prior that Score would produce.
		score := math.Abs(val) / madToSigma
		if score > maxScore {
			maxScore = score
		}
	}
	res := types.AnomalyResult{
		Timestamp:  vector.Timestamp,
		Score:      maxScore,
		IsAnomaly:  maxScore > m.threshold,
		Confidence: math.Min(maxScore/math.Max(m.threshold, 1e-9), 1.0),
		Method:     "MAD (warmup)",
	}
	if res.IsAnomaly {
		res.Details = "Detected during MAD auto-calibration warmup period (using identity prior)."
	}
	return res
}
// ── calibration helper ────────────────────────────────────────────────────────
// computeMADStats returns per-feature median and MAD for a matrix of row vectors.
// Both slices have length equal to the number of features (columns).
func computeMADStats(rows [][]float64) (medians, mads []float64) {
if len(rows) == 0 {
return nil, nil
}
nFeatures := len(rows[0])
medians = make([]float64, nFeatures)
mads = make([]float64, nFeatures)
col := make([]float64, len(rows))
devs := make([]float64, len(rows))
for f := range nFeatures {
for r, row := range rows {
if f < len(row) {
col[r] = row[f]
}
}
med := median(col)
medians[f] = med
for r, v := range col {
devs[r] = math.Abs(v - med)
}
mads[f] = median(devs)
}
return medians, mads
}
// median sorts xs in place (ascending) and returns its median: the middle
// element for an odd number of values, the mean of the two middle elements
// for an even number, and 0 for an empty slice.
func median(xs []float64) float64 {
	if len(xs) == 0 {
		return 0
	}
	sort.Float64s(xs)

	mid := len(xs) / 2
	if len(xs)%2 != 0 {
		return xs[mid]
	}
	lower, upper := xs[mid-1], xs[mid]
	return (lower + upper) / 2.0
}