bachelor-thesis/evaluation/config.py

215 lines
7 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

"""
00_config.py – Shared configuration and helper functions
============================================================
This module is imported by all evaluation scripts.
"""
import json
import os
import re
from pathlib import Path
import matplotlib
import matplotlib.dates as mdates
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from scipy import stats
# ── Paths ────────────────────────────────────────────────────────────────────
# Both locations can be overridden through environment variables; the
# fallbacks are relative to the current working directory.
ROOT = Path(os.getenv("ANALYSIS_ROOT", "./data"))
OUTPUT_DIR = Path(os.getenv("ANALYSIS_OUTPUT", "./output"))
# Import-time side effect: the output directory is created if it is missing.
OUTPUT_DIR.mkdir(exist_ok=True, parents=True)
# Every profile is recorded three times.
_RUN_NUMBERS = (1, 2, 3)

# Profile key (underscored, as used in run keys) -> spelling used in the
# file and directory names on disk (partly hyphenated).
_PROFILE_DISK_NAMES = {
    "full_cycle": "full_cycle",
    "high_bw": "high-bw",
    "high_iops": "high-iops",
    "batch_out": "batch-out",
    "validation": "validation",
}

# Run key -> (ground-truth CSV file name, pipeline output directory name),
# both relative to ROOT.
RUNS = {
    f"{profile}_run{number}": (
        f"gt_{disk_name}_run{number}.csv",
        f"pipeline_{disk_name}_run{number}",
    )
    for profile, disk_name in _PROFILE_DISK_NAMES.items()
    for number in _RUN_NUMBERS
}

# Run keys of the validation recordings.
VALIDATION_RUNS = [f"validation_run{number}" for number in _RUN_NUMBERS]

# Workload profile -> its run keys; the validation runs are listed separately
# in VALIDATION_RUNS and are not part of this mapping.
WORKLOAD_PROFILES = {
    profile: [f"{profile}_run{number}" for number in _RUN_NUMBERS]
    for profile in _PROFILE_DISK_NAMES
    if profile != "validation"
}
# Identifiers of the evaluated scenarios.
# NOTE(review): the first six look like network faults and the last three
# like host resource stress — confirm against the scenario definitions.
SCENARIO_IDS = [
    "slow-connection", "high-latency", "packet-loss",
    "congestion", "partial-outage", "flapping",
    "cpu-stress", "io-stress", "mem-stress",
]

# Presumably the significance level for the statistical tests — verify usage.
ALPHA = 0.05

# Percentile ranks in percent.
PERCENTILES = [50, 75, 90, 95, 99]
# ── Plot style ────────────────────────────────────────────────────────────────
# Global matplotlib defaults, applied once when this module is imported.

# Figure geometry and export settings
plt.rcParams.update(
    {
        "figure.dpi": 150,
        "figure.figsize": (10, 5),
        "savefig.bbox": "tight",
        "savefig.dpi": 300,
    }
)

# Typography
plt.rcParams.update(
    {
        "font.family": "serif",
        "font.size": 11,
        "axes.titlesize": 12,
        "axes.labelsize": 11,
        "xtick.labelsize": 9,
        "ytick.labelsize": 9,
        "legend.fontsize": 9,
    }
)

# Grid and axes spines
plt.rcParams.update(
    {
        "axes.grid": True,
        "grid.alpha": 0.3,
        "axes.spines.top": False,
        "axes.spines.right": False,
    }
)
# Hex colour per series category used in the plots.
PALETTE = dict(
    baseline="#4878CF",
    pipeline="#6ACC65",
    anomaly="#D65F5F",
    normal="#B8B8B8",
)
# ── Helper functions ──────────────────────────────────────────────────────────
def load_gt(run_key: str) -> pd.DataFrame:
    """Load the ground-truth CSV of one run.

    Args:
        run_key: Key into ``RUNS``; an unknown key raises ``KeyError``.

    Returns:
        The CSV contents with the ``timestamp`` column parsed as datetimes
        and a ``run`` column holding ``run_key``.
    """
    csv_name = RUNS[run_key][0]
    ground_truth = pd.read_csv(ROOT / csv_name, parse_dates=["timestamp"])
    return ground_truth.assign(run=run_key)
# A decimal number with optional sign and exponent. The exponent matters:
# a value such as ``1.5e-05`` must not be cut off after ``1.5``.
_NUMBER = r"[-+]?(?:\d+\.?\d*|\.\d+)(?:[eE][-+]?\d+)?"

# One detector entry inside the free-text ``details`` field, for example
# ``MAD!:w=0.25,s=3.1``. The optional ``!`` after the detector name is
# accepted but not captured.
_DETECTOR_PATTERN = re.compile(
    rf"(MAD|RRCF-fast|RRCF-mid|RRCF-slow|COPOD)!?:w=({_NUMBER}),s=({_NUMBER})"
)


def _parse_details(detail_str) -> dict[str, float]:
    """Extract ``<detector>_weight`` / ``<detector>_score`` pairs from one details value."""
    parsed = {}
    # str() keeps missing values (None / NaN) from raising; they yield no match.
    for match in _DETECTOR_PATTERN.finditer(str(detail_str)):
        name, weight, score = match.groups()
        parsed[f"{name}_weight"] = float(weight)
        parsed[f"{name}_score"] = float(score)
    return parsed


def load_anomalies(run_key: str) -> pd.DataFrame:
    """Load ``anomalies.jsonl`` of one run and expand its ``details`` column.

    Each non-blank line of the file is parsed as one JSON object. The
    ``details`` string of every record is split into one weight and one score
    column per detector (``<detector>_weight``, ``<detector>_score``);
    detectors that do not appear in a record are left as NaN.

    Args:
        run_key: Key into ``RUNS``; an unknown key raises ``KeyError``.

    Returns:
        One row per record with ``timestamp`` converted to UTC-aware
        datetimes, a ``run`` column holding ``run_key``, the remaining record
        fields, and the per-detector columns in place of ``details``.
        A file without records yields an empty frame that only has the
        ``timestamp`` and ``run`` columns.

    Raises:
        KeyError: If records exist but lack ``timestamp`` or ``details``.
    """
    _, pipeline_dir = RUNS[run_key]
    path = ROOT / pipeline_dir / "anomalies.jsonl"

    # Explicit encoding: do not depend on the platform default.
    with open(path, encoding="utf-8") as f:
        records = [json.loads(line) for line in f if line.strip()]

    if not records:
        # A run without anomalies must not crash on the column access below.
        return pd.DataFrame(
            {
                "timestamp": pd.Series(dtype="datetime64[ns, UTC]"),
                "run": pd.Series(dtype="object"),
            }
        )

    df = pd.DataFrame(records)
    df["timestamp"] = pd.to_datetime(df["timestamp"], utc=True)
    df["run"] = run_key

    # Build the detector columns from plain dicts in one go; reuse the index
    # so the concat below aligns row by row.
    detail_cols = pd.DataFrame(
        [_parse_details(value) for value in df["details"]],
        index=df.index,
    )
    return pd.concat([df.drop(columns=["details"]), detail_cols], axis=1)
def load_baseline_metrics(run_key: str) -> pd.DataFrame:
    """Load ``baseline_metrics.csv`` of one run (independent metrics collector).

    Args:
        run_key: Key into ``RUNS``; an unknown key raises ``KeyError``.

    Returns:
        The CSV contents with ``timestamp`` interpreted as Unix epoch seconds
        and converted to UTC-aware datetimes, plus a ``run`` column holding
        ``run_key``.
    """
    run_dir = ROOT / RUNS[run_key][1]
    metrics = pd.read_csv(run_dir / "baseline_metrics.csv")
    epoch_seconds = metrics["timestamp"]
    metrics["timestamp"] = pd.to_datetime(epoch_seconds, unit="s", utc=True)
    metrics["run"] = run_key
    return metrics
def load_duckdb_table(run_key: str, table: str) -> pd.DataFrame:
    """Load one table from a run's ``pipeline.duckdb`` into a DataFrame.

    The database is opened read-only and the connection is always closed,
    even when the query fails.

    Args:
        run_key: Key into ``RUNS``; an unknown key raises ``KeyError``.
        table: Name of the table to read in full.

    Returns:
        All rows of ``table`` plus a ``run`` column holding ``run_key``.
    """
    # Local import: DuckDB is only needed when this loader is actually called.
    import duckdb

    _, pipeline_dir = RUNS[run_key]
    db_path = str(ROOT / pipeline_dir / "pipeline.duckdb")
    con = duckdb.connect(db_path, read_only=True)
    try:
        # NOTE(review): ``table`` is interpolated into the SQL text unescaped.
        # Only pass table names that come from trusted code.
        df = con.execute(f"SELECT * FROM {table}").df()
    finally:
        con.close()
    df["run"] = run_key
    return df
def rosenthal_r(stat: float, n1: int, n2: int) -> float:
    """Return the effect size r = |Z| / sqrt(N) after Rosenthal (1991).

    Z is approximated from ``stat`` using the mean ``n1 * n2 / 2`` and the
    standard deviation ``sqrt(n1 * n2 * (n1 + n2 + 1) / 12)`` of the
    Mann-Whitney U statistic; no tie correction is applied.

    NOTE(review): these moments describe U. Passing the signed-rank statistic
    from the paired branch of ``wilcoxon_test`` would not match them —
    confirm how callers use this function.

    Args:
        stat: Test statistic, treated as a Mann-Whitney U value.
        n1: Size of the first sample.
        n2: Size of the second sample.

    Returns:
        The non-negative effect size, or ``0.0`` when the standard deviation
        is zero (for example when one sample is empty).
    """
    pair_count = n1 * n2
    total = n1 + n2
    u_std = np.sqrt((pair_count * (total + 1)) / 12)
    if u_std == 0:
        return 0.0
    u_mean = pair_count / 2
    z_score = (stat - u_mean) / u_std
    return abs(z_score / np.sqrt(total))
def wilcoxon_test(a: np.ndarray, b: np.ndarray) -> tuple[float, float]:
    """Compare two samples with a two-sided rank test.

    Samples of equal length are treated as paired and compared with the
    Wilcoxon signed-rank test (``zero_method="wilcox"``). Samples of
    different length fall back to the Mann-Whitney U test.

    Args:
        a: First sample.
        b: Second sample.

    Returns:
        ``(statistic, p_value)`` as plain floats. For non-empty paired
        samples whose differences are all zero the result is ``(0.0, 1.0)``.
    """
    if len(a) == len(b):
        differences = np.asarray(a, dtype=float) - np.asarray(b, dtype=float)
        # With zero_method="wilcox" all zero differences are discarded, so
        # identical samples leave nothing to rank and some SciPy versions
        # raise ValueError. Report "no difference" explicitly instead.
        if differences.size > 0 and not differences.any():
            return 0.0, 1.0
        stat, p = stats.wilcoxon(a, b, alternative="two-sided", zero_method="wilcox")
    else:
        stat, p = stats.mannwhitneyu(a, b, alternative="two-sided")
    return float(stat), float(p)
def effect_size_label(r: float) -> str:
    """Classify the magnitude of an effect size r after Cohen (1988).

    The sign of ``r`` is ignored. Each threshold is an exclusive upper bound:
    below 0.1, below 0.3, below 0.5, and everything else. A NaN input fails
    every comparison and therefore receives the last label.

    Args:
        r: Effect size.

    Returns:
        The German label for the magnitude class.
    """
    magnitude = abs(r)
    upper_bounds = (
        (0.1, "vernachlässigbar"),
        (0.3, "klein"),
        (0.5, "mittel"),
    )
    for upper_bound, label in upper_bounds:
        if magnitude < upper_bound:
            return label
    return "groß"
def save_fig(name: str):
    """Save the current figure as ``<name>.pdf`` and ``<name>.png``, then close it.

    Both files are written to ``OUTPUT_DIR``.

    Args:
        name: File name without extension.
    """
    targets = [OUTPUT_DIR / f"{name}.{suffix}" for suffix in ("pdf", "png")]
    for target in targets:
        plt.savefig(target)
    # Close only after both formats are written.
    plt.close()
def print_section(title: str):
    """Print ``title`` as a section banner between two 60-character rules.

    A blank line precedes the banner.

    Args:
        title: Text shown between the rules.
    """
    rule = "=" * 60
    # The empty first item produces the leading blank line.
    print("", rule, f" {title}", rule, sep="\n")