// Package detect provides anomaly detection algorithms and ensemble logic. package detect import ( "log" "math" "sort" "sync" "codeberg.org/pata1704/guenther/pkg/types" ) // MADDetector scores feature vectors using per-feature Median Absolute // Deviation (MAD) with pre-calibrated or automatically derived statistics. // // Pass nil for medians and mads and set calibrationSize > 0 via // NewMADDetectorAutoCalibrate. The detector buffers the first calibrationSize // NormalizedVectors, computes per-feature statistics once the buffer is full, // and starts scoring normally afterwards. During the warmup phase Score // returns score=0 / IsAnomaly=false. // // detector := NewMADDetectorAutoCalibrate(3.5, 100) // // SEAD down-weights MAD automatically during the warmup phase because // all scores are zero; once calibration completes SEAD will start to // consider MAD scores in its weight updates. // // # Calibration contract // // The medians and mads slices must be computed from the SAME representation // that arrives in vector.NormalizedVector – i.e. from the RobustScaler-scaled // feature vectors, NOT from raw window aggregates. // // # Scoring // // For each feature i the modified Z-score is: // // score_i = |x_i - median_i| / (1.4826 * MAD_i) // // The constant 1.4826 ≈ 1/(Φ⁻¹(3/4)) makes MAD a consistent estimator of σ // under normality (Rousseeuw & Croux, 1993). The anomaly score is the maximum // modified Z-score across all features. // // # Fit / Update // // When calibration is already complete, Fit replaces the // current statistics with values derived from the supplied vectors. Update is a // no-op. type MADDetector struct { mu sync.Mutex threshold float64 medians []float64 // per-feature median of NormalizedVector in baseline mads []float64 // per-feature MAD of NormalizedVector in baseline // Auto-calibration state. calibrationSize == 0 means disabled. calibrationSize int calibrationBuf [][]float64 // collected NormalizedVectors during warmup calibrated bool } // NewMADDetector creates a MADDetector with pre-calibrated baseline statistics. // // - threshold: anomaly score cutoff (modified Z-score). Typical: 2.5–4.0. // - medians: per-feature median computed from NormalizedVector in baseline. // - mads: per-feature MAD computed from NormalizedVector in baseline. // Zero entries are replaced with 1.0 to avoid division-by-zero. // // Pass nil for medians and mads only when calibrationSize > 0 is set via // NewMADDetectorAutoCalibrate; otherwise all scores will be zero. func NewMADDetector(threshold float64, medians, mads []float64) *MADDetector { return &MADDetector{ threshold: threshold, medians: medians, mads: mads, calibrated: len(medians) > 0, } } // NewMADDetectorAutoCalibrate creates a MADDetector that derives its own // per-feature statistics from the first calibrationSize NormalizedVectors // it encounters in Score. // // - threshold: modified Z-score cutoff after calibration. Typical: 3.5. // - calibrationSize: number of vectors to buffer before first calibration. // Recommended: 60–200 func NewMADDetectorAutoCalibrate(threshold float64, calibrationSize int) *MADDetector { if calibrationSize <= 0 { calibrationSize = 100 } // Initialise with "Identity" stats (median=0, mad=1) so the detector is // operational immediately with a global sensitivity of 1.0 (baseline IQR). // Features are already RobustScaled by DuckDB, so this is a sane prior. // Automatic calibration will refine these once the buffer is full. return &MADDetector{ threshold: threshold, calibrationSize: calibrationSize, medians: nil, // will be Lazy-init or from buffer mads: nil, } } // Fit recomputes per-feature median and MAD from the supplied vectors, // replacing any prior calibration. Safe to call concurrently with Score. func (m *MADDetector) Fit(vectors []types.FeatureVector) error { if len(vectors) == 0 { return nil } raw := make([][]float64, len(vectors)) for i, v := range vectors { raw[i] = v.NormalizedVector } medians, mads := computeMADStats(raw) m.mu.Lock() m.medians = medians m.mads = mads m.calibrated = true m.calibrationBuf = nil m.mu.Unlock() log.Printf("mad: fitted on %d vectors (%d features)", len(vectors), len(medians)) return nil } // Update is a no-op when manual statistics are used. When auto-calibration is // active it is equivalent to calling Score but discards the result. func (m *MADDetector) Update(v types.FeatureVector) error { _, _ = m.Score(v) return nil } // Score computes the maximum modified Z-score across all features of vector. // // During the auto-calibration warmup the vector is buffered and a zero-score // result is returned. Once the calibration buffer is full the statistics are // derived automatically and scoring starts on the next call. // // vector.NormalizedVector must contain values on the same scale as the // medians and mads slices (i.e. RobustScaler-scaled values from DuckDB). func (m *MADDetector) Score(vector types.FeatureVector) (types.AnomalyResult, error) { m.mu.Lock() // ── Auto-calibration warmup ─────────────────────────────────────────── if !m.calibrated && m.calibrationSize > 0 { if vec := vector.NormalizedVector; len(vec) > 0 { cp := make([]float64, len(vec)) copy(cp, vec) m.calibrationBuf = append(m.calibrationBuf, cp) } if len(m.calibrationBuf) >= m.calibrationSize { m.medians, m.mads = computeMADStats(m.calibrationBuf) m.calibrated = true m.calibrationBuf = nil log.Printf("mad: auto-calibrated on %d vectors (%d features)", m.calibrationSize, len(m.medians)) } if !m.calibrated { m.mu.Unlock() return m.scoreIdentity(vector), nil } } medians := m.medians mads := m.mads m.mu.Unlock() // ── Scoring ─────────────────────────────────────────────────────────── maxScore := 0.0 for i, val := range vector.NormalizedVector { if i >= len(medians) || i >= len(mads) { break } // Stability floor: prevent explosive Z-scores for features with near-zero variance. // 1e-2 corresponds to 1% of the original baseline IQR. mad := math.Max(mads[i], 0.01) // 1.4826 converts MAD to an estimator of standard deviation. score := math.Abs(val-medians[i]) / (1.4826 * mad) if score > maxScore { maxScore = score } } return types.AnomalyResult{ Timestamp: vector.Timestamp, Score: maxScore, IsAnomaly: maxScore > m.threshold, Confidence: math.Min(maxScore/math.Max(m.threshold, 1e-9), 1.0), Method: "MAD", }, nil } // scoreIdentity provides a sane fallback (median=0, mad=1) for pre-scaled data. func (m *MADDetector) scoreIdentity(vector types.FeatureVector) types.AnomalyResult { maxScore := 0.0 for _, val := range vector.NormalizedVector { score := math.Abs(val) / 0.6745 // 1/1.4826 if score > maxScore { maxScore = score } } res := types.AnomalyResult{ Timestamp: vector.Timestamp, Score: maxScore, IsAnomaly: maxScore > m.threshold, Confidence: math.Min(maxScore/math.Max(m.threshold, 1e-9), 1.0), Method: "MAD (warmup)", } if res.IsAnomaly { res.Details = "Detected during MAD auto-calibration warmup period (using identity prior)." } return res } // ── calibration helper ──────────────────────────────────────────────────────── // computeMADStats returns per-feature median and MAD for a matrix of row vectors. // Both slices have length equal to the number of features (columns). func computeMADStats(rows [][]float64) (medians, mads []float64) { if len(rows) == 0 { return nil, nil } nFeatures := len(rows[0]) medians = make([]float64, nFeatures) mads = make([]float64, nFeatures) col := make([]float64, len(rows)) devs := make([]float64, len(rows)) for f := range nFeatures { for r, row := range rows { if f < len(row) { col[r] = row[f] } } med := median(col) medians[f] = med for r, v := range col { devs[r] = math.Abs(v - med) } mads[f] = median(devs) } return medians, mads } // median returns the median of xs. xs is modified in-place (sorted). func median(xs []float64) float64 { n := len(xs) if n == 0 { return 0 } sort.Float64s(xs) if n%2 == 1 { return xs[n/2] } return (xs[n/2-1] + xs[n/2]) / 2.0 }