guenther/internal/detect/mad.go

// Package detect provides anomaly detection algorithms and ensemble logic.
package detect

import (
	"log"
	"math"
	"sort"
	"sync"

	"codeberg.org/pata1704/guenther/pkg/types"
)

// MADDetector scores feature vectors using per-feature Median Absolute
// Deviation (MAD) with pre-calibrated or automatically derived statistics.
//
// # Construction
//
// NewMADDetector takes ready-made per-feature medians and MADs.
// NewMADDetectorAutoCalibrate takes none: the detector buffers the first
// calibrationSize non-empty NormalizedVectors passed to Score, computes
// per-feature statistics once the buffer is full, and scores against those
// statistics from that call onwards.
//
//	detector := NewMADDetectorAutoCalibrate(3.5, 100)
//
// During the warmup phase Score does not return a zero score: it delegates to
// scoreIdentity, which scores against an identity prior and labels the result
// with Method "MAD (warmup)".
//
// NOTE(review): an earlier version of this comment stated that SEAD
// down-weights MAD during warmup because all warmup scores are zero. Warmup
// scores are not zero in this file, so confirm how SEAD treats them.
//
// # Calibration contract
//
// The medians and mads slices must be computed from the SAME representation
// that arrives in vector.NormalizedVector – i.e. from the RobustScaler-scaled
// feature vectors, NOT from raw window aggregates.
//
// # Scoring
//
// For each feature i the modified Z-score is:
//
//	score_i = |x_i - median_i| / (1.4826 * max(MAD_i, 0.01))
//
// The constant 1.4826 ≈ 1/(Φ⁻¹(3/4)) makes MAD a consistent estimator of σ
// under normality (Rousseeuw & Croux, 1993). The 0.01 floor keeps features
// with near-zero spread from producing explosive scores. The anomaly score is
// the maximum modified Z-score across all features.
//
// # Fit / Update
//
// Fit replaces the current statistics with values derived from the supplied
// vectors and ends any warmup that is still in progress. Update forwards its
// vector to Score and discards the result, so it only has an observable
// effect while auto-calibration is still collecting vectors.
type MADDetector struct {
	// mu guards medians, mads, calibrationBuf and calibrated. threshold is set
	// by the constructors and is not written by any method shown in this file,
	// which is why Score reads it without holding mu.
	mu        sync.Mutex
	threshold float64
	medians   []float64 // per-feature median of NormalizedVector in baseline
	mads      []float64 // per-feature MAD of NormalizedVector in baseline

	// Auto-calibration state. calibrationSize == 0 means disabled.
	// calibrated becomes true when statistics are supplied at construction,
	// installed by Fit, or derived by Score once calibrationBuf is full.
	calibrationSize int
	calibrationBuf  [][]float64 // copies of NormalizedVectors collected during warmup
	calibrated      bool
}

// NewMADDetector builds a MADDetector around baseline statistics that the
// caller has already computed.
//
//   - threshold: anomaly score cutoff (modified Z-score). Typical: 2.5–4.0.
//   - medians:   per-feature median of NormalizedVector over the baseline.
//   - mads:      per-feature MAD of NormalizedVector over the baseline.
//
// Both slices are stored as given; they are neither copied nor rewritten
// here. Tiny or zero MAD entries are handled at scoring time, where Score
// floors each MAD at 0.01.
//
// The detector counts as calibrated only when medians is non-empty. With
// empty medians, and auto-calibration not enabled by this constructor, every
// score is zero; use NewMADDetectorAutoCalibrate to have the statistics
// derived automatically instead.
func NewMADDetector(threshold float64, medians, mads []float64) *MADDetector {
	d := &MADDetector{threshold: threshold}
	d.medians, d.mads = medians, mads
	d.calibrated = len(medians) != 0
	return d
}

// NewMADDetectorAutoCalibrate builds a MADDetector that learns its own
// per-feature statistics from the first calibrationSize NormalizedVectors
// passed to Score.
//
//   - threshold:       modified Z-score cutoff. Typical: 3.5.
//   - calibrationSize: number of vectors to buffer before the statistics are
//     computed. Recommended: 60–200. Values <= 0 fall back to 100.
//
// No statistics are stored up front. Until the buffer is full, Score falls
// back to an identity prior (see scoreIdentity), so the detector produces
// results from the very first vector.
func NewMADDetectorAutoCalibrate(threshold float64, calibrationSize int) *MADDetector {
	size := calibrationSize
	if size <= 0 {
		size = 100
	}

	// medians and mads stay nil until calibration (or Fit) fills them in.
	d := new(MADDetector)
	d.threshold = threshold
	d.calibrationSize = size
	return d
}

// Fit recomputes per-feature median and MAD from the supplied vectors,
// replacing any prior calibration and ending an auto-calibration warmup that
// is still in progress. Safe to call concurrently with Score: the statistics
// are computed outside the lock and swapped in under m.mu.
//
// Vectors with an empty NormalizedVector are skipped, mirroring the
// auto-calibration path in Score. If no usable vector remains (including a
// nil or empty input) the detector is left untouched; otherwise an empty
// leading vector would install zero-length statistics and silence the
// detector.
//
// The returned error is always nil.
func (m *MADDetector) Fit(vectors []types.FeatureVector) error {
	raw := make([][]float64, 0, len(vectors))
	for i := range vectors {
		if nv := vectors[i].NormalizedVector; len(nv) > 0 {
			raw = append(raw, nv)
		}
	}
	if len(raw) == 0 {
		return nil
	}
	medians, mads := computeMADStats(raw)

	m.mu.Lock()
	m.medians = medians
	m.mads = mads
	m.calibrated = true
	m.calibrationBuf = nil // any partially filled warmup buffer is obsolete now
	m.mu.Unlock()

	log.Printf("mad: fitted on %d vectors (%d features)", len(raw), len(medians))
	return nil
}

// Update feeds v through Score and discards the scoring result.
//
// While auto-calibration is still warming up this adds v to the calibration
// buffer (and may complete calibration). Once the detector is calibrated the
// call has no lasting effect on its state.
//
// Any error reported by Score is returned to the caller rather than dropped.
func (m *MADDetector) Update(v types.FeatureVector) error {
	_, err := m.Score(v)
	return err
}

// Score computes the maximum modified Z-score across all features of vector.
//
// While auto-calibration is still warming up, a copy of each non-empty
// NormalizedVector is buffered and the result comes from scoreIdentity
// (Method "MAD (warmup)"). The call that fills the buffer derives the
// statistics and is itself already scored against them.
//
// vector.NormalizedVector must contain values on the same scale as the
// medians and mads slices (i.e. RobustScaler-scaled values from DuckDB).
// Features without a matching median/MAD entry are ignored. The returned
// error is always nil.
func (m *MADDetector) Score(vector types.FeatureVector) (types.AnomalyResult, error) {
	features := vector.NormalizedVector

	m.mu.Lock()
	if m.calibrationSize > 0 && !m.calibrated {
		// Warmup: keep a private copy so later caller-side mutation of the
		// vector cannot change the calibration sample.
		if len(features) > 0 {
			sample := make([]float64, len(features))
			copy(sample, features)
			m.calibrationBuf = append(m.calibrationBuf, sample)
		}
		if len(m.calibrationBuf) < m.calibrationSize {
			m.mu.Unlock()
			return m.scoreIdentity(vector), nil
		}
		// Buffer is full: derive the statistics and fall through to regular
		// scoring for this very vector.
		m.medians, m.mads = computeMADStats(m.calibrationBuf)
		m.calibrated = true
		m.calibrationBuf = nil
		log.Printf("mad: auto-calibrated on %d vectors (%d features)",
			m.calibrationSize, len(m.medians))
	}
	// Snapshot the slice headers so the scoring loop can run without the lock.
	centers, spreads := m.medians, m.mads
	m.mu.Unlock()

	// Only features that have both a median and a MAD can be scored.
	limit := min(len(features), len(centers), len(spreads))
	var worst float64
	for i := 0; i < limit; i++ {
		// Stability floor: 1e-2 corresponds to 1% of the original baseline
		// IQR and prevents explosive scores for near-constant features.
		spread := math.Max(spreads[i], 0.01)

		// 1.4826 converts MAD to an estimator of standard deviation.
		if z := math.Abs(features[i]-centers[i]) / (1.4826 * spread); z > worst {
			worst = z
		}
	}

	result := types.AnomalyResult{
		Timestamp: vector.Timestamp,
		Score:     worst,
		Method:    "MAD",
	}
	result.IsAnomaly = worst > m.threshold
	result.Confidence = math.Min(worst/math.Max(m.threshold, 1e-9), 1.0)
	return result, nil
}

// scoreIdentity scores vector against an identity prior (median=0, mad=1),
// which is a sane fallback for pre-scaled data while auto-calibration is
// still collecting samples.
//
// It applies the same formula as Score with those statistics plugged in:
//
//	score_i = |x_i - 0| / (1.4826 * 1)
//
// The anomaly score is the maximum over all features. Results carry Method
// "MAD (warmup)", and anomalies additionally get an explanatory Details text.
func (m *MADDetector) scoreIdentity(vector types.FeatureVector) types.AnomalyResult {
	maxScore := 0.0
	for _, val := range vector.NormalizedVector {
		// Divide by 1.4826 (not by its reciprocal 0.6745) so warmup scores sit
		// on the same scale as calibrated scores with median=0, mad=1.
		score := math.Abs(val) / 1.4826
		if score > maxScore {
			maxScore = score
		}
	}
	res := types.AnomalyResult{
		Timestamp:  vector.Timestamp,
		Score:      maxScore,
		IsAnomaly:  maxScore > m.threshold,
		Confidence: math.Min(maxScore/math.Max(m.threshold, 1e-9), 1.0),
		Method:     "MAD (warmup)",
	}
	if res.IsAnomaly {
		res.Details = "Detected during MAD auto-calibration warmup period (using identity prior)."
	}
	return res
}

// ── calibration helper ────────────────────────────────────────────────────────

// computeMADStats returns per-feature median and MAD for a matrix of row vectors.
// Both slices have length equal to the number of features (columns).
func computeMADStats(rows [][]float64) (medians, mads []float64) {
	if len(rows) == 0 {
		return nil, nil
	}
	nFeatures := len(rows[0])
	medians = make([]float64, nFeatures)
	mads = make([]float64, nFeatures)

	col := make([]float64, len(rows))
	devs := make([]float64, len(rows))
	for f := range nFeatures {
		for r, row := range rows {
			if f < len(row) {
				col[r] = row[f]
			}
		}
		med := median(col)
		medians[f] = med
		for r, v := range col {
			devs[r] = math.Abs(v - med)
		}
		mads[f] = median(devs)
	}
	return medians, mads
}

// median returns the median of xs. xs is modified in-place (sorted).
func median(xs []float64) float64 {
	n := len(xs)
	if n == 0 {
		return 0
	}
	sort.Float64s(xs)
	if n%2 == 1 {
		return xs[n/2]
	}
	return (xs[n/2-1] + xs[n/2]) / 2.0
}