"""
00_config.py – Shared configuration and helper functions
========================================================

This module is imported by all evaluation scripts.
"""

import json
import math
import os
import re
from pathlib import Path

import matplotlib
import matplotlib.dates as mdates
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from scipy import stats

# ── Paths ────────────────────────────────────────────────────────────────────
# Both locations can be overridden through environment variables. A leading
# "~" in an override is expanded, so values coming from .env files or from
# shells that did not expand it still resolve to the intended directory.
ROOT = Path(os.environ.get("ANALYSIS_ROOT", "./data")).expanduser()
OUTPUT_DIR = Path(os.environ.get("ANALYSIS_OUTPUT", "./output")).expanduser()
# Created at import time (including missing parents); a no-op if it exists.
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
# run key -> (ground-truth CSV file name, pipeline directory name).
# Note that the run keys use underscores throughout, while the file and
# directory names use hyphens for some profiles (e.g. "high-bw").
RUNS = {
    "full_cycle_run1": ("gt_full_cycle_run1.csv", "pipeline_full_cycle_run1"),
    "full_cycle_run2": ("gt_full_cycle_run2.csv", "pipeline_full_cycle_run2"),
    "full_cycle_run3": ("gt_full_cycle_run3.csv", "pipeline_full_cycle_run3"),
    "high_bw_run1": ("gt_high-bw_run1.csv", "pipeline_high-bw_run1"),
    "high_bw_run2": ("gt_high-bw_run2.csv", "pipeline_high-bw_run2"),
    "high_bw_run3": ("gt_high-bw_run3.csv", "pipeline_high-bw_run3"),
    "high_iops_run1": ("gt_high-iops_run1.csv", "pipeline_high-iops_run1"),
    "high_iops_run2": ("gt_high-iops_run2.csv", "pipeline_high-iops_run2"),
    "high_iops_run3": ("gt_high-iops_run3.csv", "pipeline_high-iops_run3"),
    "batch_out_run1": ("gt_batch-out_run1.csv", "pipeline_batch-out_run1"),
    "batch_out_run2": ("gt_batch-out_run2.csv", "pipeline_batch-out_run2"),
    "batch_out_run3": ("gt_batch-out_run3.csv", "pipeline_batch-out_run3"),
    "validation_run1": ("gt_validation_run1.csv", "pipeline_validation_run1"),
    "validation_run2": ("gt_validation_run2.csv", "pipeline_validation_run2"),
    "validation_run3": ("gt_validation_run3.csv", "pipeline_validation_run3"),
}

# The groupings below are derived from RUNS (in RUNS order) instead of being
# listed by hand, so adding or removing a run only requires touching RUNS.
VALIDATION_RUNS = [key for key in RUNS if key.startswith("validation_run")]

# workload profile -> run keys belonging to that profile.
WORKLOAD_PROFILES = {
    profile: [key for key in RUNS if key.startswith(f"{profile}_run")]
    for profile in ("full_cycle", "high_bw", "high_iops", "batch_out")
}
# Known scenario identifiers.
SCENARIO_IDS = [
    "slow-connection",
    "high-latency",
    "packet-loss",
    "congestion",
    "partial-outage",
    "flapping",
    "cpu-stress",
    "io-stress",
    "mem-stress",
]

# NOTE(review): presumably the significance level used for the statistical
# tests — confirm against the evaluation scripts.
ALPHA = 0.05
# NOTE(review): presumably percentile ranks (in percent) reported by the
# evaluation scripts — confirm against callers.
PERCENTILES = [50, 75, 90, 95, 99]
# ── Plot style ────────────────────────────────────────────────────────────────
# Applied to matplotlib's global rcParams when this module is imported.
plt.rcParams.update(
    {
        "figure.dpi": 150,
        "figure.figsize": (10, 5),
        "font.family": "serif",
        "font.size": 11,
        "axes.titlesize": 12,
        "axes.labelsize": 11,
        "xtick.labelsize": 9,
        "ytick.labelsize": 9,
        "legend.fontsize": 9,
        "axes.grid": True,
        "grid.alpha": 0.3,
        "axes.spines.top": False,
        "axes.spines.right": False,
        "savefig.bbox": "tight",
        "savefig.dpi": 300,
    }
)
# Named hex colours shared by the plots.
PALETTE = {
    "baseline": "#4878CF",
    "pipeline": "#6ACC65",
    "anomaly": "#D65F5F",
    "normal": "#B8B8B8",
}
# ── Helper functions ──────────────────────────────────────────────────────────


def load_gt(run_key: str) -> pd.DataFrame:
    """Load the ground-truth CSV of one run.

    Args:
        run_key: Key into ``RUNS`` identifying the run.

    Returns:
        The CSV contents with the ``timestamp`` column passed through
        pandas' date parsing and an added ``run`` column holding ``run_key``.

    Raises:
        KeyError: If ``run_key`` is not a key of ``RUNS``.
    """
    gt_file, _ = RUNS[run_key]
    # NOTE(review): unlike the other loaders in this module, timestamps are
    # not forced to UTC here — confirm callers do not mix naive and aware
    # timestamps when joining against the other tables.
    df = pd.read_csv(ROOT / gt_file, parse_dates=["timestamp"])
    df["run"] = run_key
    return df
# Matches one detector entry inside a 'details' string: detector name, an
# optional "!", then "w=<weight>,s=<score>".
# NOTE(review): the numeric groups only match digits and dots, so negative or
# exponent-formatted values would be skipped — confirm the producer's format.
_DETECTOR_PATTERN = re.compile(
    r"(MAD|RRCF-fast|RRCF-mid|RRCF-slow|COPOD)!?:w=([\d.]+),s=([\d.]+)"
)


def _parse_detector_details(detail_str) -> dict:
    """Return ``{<detector>_weight, <detector>_score}`` values found in one details string."""
    parsed = {}
    for match in _DETECTOR_PATTERN.finditer(str(detail_str)):
        name, weight, score = match.groups()
        parsed[f"{name}_weight"] = float(weight)
        parsed[f"{name}_score"] = float(score)
    return parsed


def load_anomalies(run_key: str) -> pd.DataFrame:
    """Load ``anomalies.jsonl`` of one run and expand its ``details`` column.

    Each non-blank line of the file is parsed as one JSON record. The
    ``details`` string of every record is split into one ``<detector>_weight``
    and one ``<detector>_score`` column per detector found; detectors missing
    from a record yield NaN in that row.

    Args:
        run_key: Key into ``RUNS`` identifying the run.

    Returns:
        One row per record with ``timestamp`` converted to UTC datetimes, an
        added ``run`` column and the ``details`` column replaced by the
        per-detector columns. If the file contains no records, an empty frame
        with ``timestamp`` and ``run`` columns is returned.

    Raises:
        KeyError: If ``run_key`` is not a key of ``RUNS``.
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    _, pipeline_dir = RUNS[run_key]
    path = ROOT / pipeline_dir / "anomalies.jsonl"
    # Explicit encoding: JSON is UTF-8 regardless of the platform default.
    with open(path, encoding="utf-8") as f:
        records = [json.loads(line) for line in f if line.strip()]

    if not records:
        # A run without anomalies is valid; return a well-formed empty frame
        # instead of failing on the missing 'timestamp' column.
        return pd.DataFrame(
            {
                "timestamp": pd.to_datetime([], utc=True),
                "run": pd.Series([], dtype="object"),
            }
        )

    df = pd.DataFrame(records)
    df["timestamp"] = pd.to_datetime(df["timestamp"], utc=True)
    df["run"] = run_key

    if "details" not in df.columns:
        return df

    # Build all detector columns in one go; the shared index keeps the rows
    # aligned with ``df`` for the concat below.
    detail_cols = pd.DataFrame(
        [_parse_detector_details(detail) for detail in df["details"]],
        index=df.index,
    )
    return pd.concat([df.drop(columns=["details"]), detail_cols], axis=1)
def load_baseline_metrics(run_key: str) -> pd.DataFrame:
    """Load ``baseline_metrics.csv`` (independent metrics collector) of one run.

    Args:
        run_key: Key into ``RUNS`` identifying the run.

    Returns:
        The CSV contents with ``timestamp`` interpreted as seconds since the
        Unix epoch and converted to UTC datetimes, plus an added ``run``
        column holding ``run_key``.

    Raises:
        KeyError: If ``run_key`` is not a key of ``RUNS``.
    """
    _, pipeline_dir = RUNS[run_key]
    path = ROOT / pipeline_dir / "baseline_metrics.csv"
    df = pd.read_csv(path)
    df["timestamp"] = pd.to_datetime(df["timestamp"], unit="s", utc=True)
    df["run"] = run_key
    return df
def load_duckdb_table(run_key: str, table: str) -> pd.DataFrame:
    """Load one table from a run's ``pipeline.duckdb`` into a DataFrame.

    The database is opened read-only and the connection is closed again even
    if the query fails.

    Args:
        run_key: Key into ``RUNS`` identifying the run.
        table: Name of the table to read in full.

    Returns:
        All rows of ``table`` plus an added ``run`` column holding ``run_key``.

    Raises:
        KeyError: If ``run_key`` is not a key of ``RUNS``.
    """
    # Local import: within this module, duckdb is only needed by this function.
    import duckdb

    _, pipeline_dir = RUNS[run_key]
    db_path = str(ROOT / pipeline_dir / "pipeline.duckdb")
    con = duckdb.connect(db_path, read_only=True)
    try:
        # NOTE: ``table`` is interpolated verbatim into the SQL text (an
        # identifier cannot be bound as a query parameter). Only pass trusted,
        # hard-coded table names.
        df = con.execute(f"SELECT * FROM {table}").df()
    finally:
        con.close()
    df["run"] = run_key
    return df
def rosenthal_r(stat: float, n1: int, n2: int) -> float:
    """Return the effect size r = |Z| / sqrt(N) after Rosenthal (1991).

    ``stat`` is treated as a Mann-Whitney U statistic (e.g. as returned by
    ``scipy.stats.mannwhitneyu``) and converted to Z with the normal
    approximation ``Z = (U - n1*n2/2) / sqrt(n1*n2*(n1+n2+1)/12)``; no tie or
    continuity correction is applied.

    NOTE(review): the mean/variance used here are those of U. Passing a
    Wilcoxon signed-rank statistic would not give a meaningful r.

    Args:
        stat: The U statistic.
        n1: Size of the first sample.
        n2: Size of the second sample.

    Returns:
        The non-negative effect size as a plain ``float``; ``0.0`` when
        either sample is empty (the variance of U is zero in that case).

    Raises:
        ValueError: If ``n1`` or ``n2`` is negative.
    """
    if n1 < 0 or n2 < 0:
        raise ValueError(f"sample sizes must be non-negative, got n1={n1}, n2={n2}")
    n = n1 + n2
    mu_u = (n1 * n2) / 2
    sigma_u = math.sqrt((n1 * n2 * (n + 1)) / 12)
    if sigma_u == 0:
        return 0.0
    z = (stat - mu_u) / sigma_u
    # float(): keep the annotated return type even when ``stat`` is a NumPy scalar.
    return float(abs(z) / math.sqrt(n))
def wilcoxon_test(a: np.ndarray, b: np.ndarray) -> tuple[float, float]:
    """Run a two-sided Wilcoxon signed-rank test, falling back to Mann-Whitney U.

    Samples of equal length are treated as paired and compared with
    ``scipy.stats.wilcoxon`` (``zero_method="wilcox"``). Samples of different
    length cannot be paired and are compared with ``scipy.stats.mannwhitneyu``.

    Args:
        a: First sample.
        b: Second sample.

    Returns:
        ``(statistic, p_value)`` as plain floats. Two degenerate inputs are
        handled explicitly so the result does not depend on the SciPy version:

        * either sample empty -> ``(nan, nan)``;
        * equal-length samples whose paired differences are all zero ->
          ``(0.0, 1.0)``, i.e. no evidence of a difference.
    """
    x = np.asarray(a, dtype=float)
    y = np.asarray(b, dtype=float)
    if x.size == 0 or y.size == 0:
        return float("nan"), float("nan")

    if len(x) == len(y):
        # With zero_method="wilcox" all zero differences are discarded, which
        # leaves nothing to rank when the samples are identical.
        if not np.any(x - y):
            return 0.0, 1.0
        stat, p = stats.wilcoxon(x, y, alternative="two-sided", zero_method="wilcox")
    else:
        stat, p = stats.mannwhitneyu(x, y, alternative="two-sided")
    return float(stat), float(p)
def effect_size_label(r: float) -> str:
    """Classify an effect size r using the thresholds of Cohen (1988).

    Args:
        r: Effect size; only its absolute value is used.

    Returns:
        ``"vernachlässigbar"`` for |r| < 0.1, ``"klein"`` for |r| < 0.3,
        ``"mittel"`` for |r| < 0.5, ``"groß"`` otherwise, and
        ``"undefiniert"`` when ``r`` is NaN.
    """
    r = abs(r)
    # NaN fails every "<" comparison below and would otherwise be reported as
    # a large effect.
    if math.isnan(r):
        return "undefiniert"
    if r < 0.1:
        return "vernachlässigbar"
    if r < 0.3:
        return "klein"
    if r < 0.5:
        return "mittel"
    return "groß"
def save_fig(name: str) -> None:
    """Save the current matplotlib figure as PDF and PNG, then close it.

    The files are written to ``OUTPUT_DIR / f"{name}.pdf"`` and
    ``OUTPUT_DIR / f"{name}.png"``. The figure is closed even if saving
    fails, so a failed export does not leave figures open.

    Args:
        name: File name without extension, relative to ``OUTPUT_DIR``.
    """
    try:
        for ext in ("pdf", "png"):
            target = OUTPUT_DIR / f"{name}.{ext}"
            # Covers names containing sub-directories and an output directory
            # that was removed after import.
            target.parent.mkdir(parents=True, exist_ok=True)
            plt.savefig(target)
    finally:
        plt.close()
def print_section(title: str, width: int = 60) -> None:
    """Print a section heading framed by two rules of ``=`` characters.

    Output is a blank line, a rule, the title preceded by one space, and a
    second rule.

    Args:
        title: Heading text.
        width: Number of ``=`` characters per rule (default 60).
    """
    rule = "=" * width
    print(f"\n{rule}")
    print(f" {title}")
    print(rule)