bachelor-thesis/scenario_generator/run_evaluation.py

379 lines
12 KiB
Python

#!/usr/bin/env python3
import argparse
import logging
import os
import shutil
import subprocess
import sys
import time
from pathlib import Path
# Central configuration for the evaluation run: local tool paths, VM access
# data, remote pipeline file locations, workload names and phase durations.
# The Path.home()-based entries point into one specific developer checkout.
CONFIG = {
    # Ansible inventory handed to the scenario generator via --inventory.
    "INVENTORY": Path.home()
    / "dev/privat/Go/BachelorThesis/bachelor-infra/ansible/inventory.ini",
    # Local scenario generator executable that drives each experiment.
    "GENERATOR": "./scenario-generator",
    # File that is moved to OUTPUT_DIR/gt_<label>.csv after a run, if present.
    "GROUND_TRUTH": "thesis_scenario_ground_truth.csv",
    # Local directory that receives one pipeline_<label> folder per run.
    "OUTPUT_DIR": Path("./evaluation_results"),
    # Local directory for the master log and the per-run generator logs.
    "LOG_DIR": Path("./evaluation_logs"),
    # Script executed with python3 before each run.
    "CLEANUP_SCRIPT": "./cleanup_jobs.py",
    # Private key passed to ssh/rsync via -i.
    "SSH_KEY": Path.home()
    / "dev/privat/Go/BachelorThesis/bachelor-infra/infrastructure/tf-cloud-init",
    # Login used for every ssh/rsync call: VM_USER@VM_HOST.
    "VM_USER": "baUser",
    "VM_HOST": "192.168.122.97",
    # Remote directory the pipeline binary is started from.
    "PIPELINE_WORKDIR": "/home/baUser/anomaly-detector-pipeline",
    "PIPELINE_BIN": "/home/baUser/anomaly-detector-pipeline/anomaly-pipeline",
    # Remote artefacts that archive_and_reset copies and then deletes.
    "PIPELINE_DB": "/home/baUser/anomaly-detector-pipeline/data/pipeline_test.duckdb",
    # Not referenced directly in the visible code; it is covered by the
    # "PIPELINE_DB*" glob used when the remote files are removed.
    "PIPELINE_DB_WAL": "/home/baUser/anomaly-detector-pipeline/data/pipeline_test.duckdb.wal",
    "PIPELINE_ANOMALIES": "/home/baUser/anomaly-detector-pipeline/logs/anomalies.jsonl",
    # Remote file the pipeline's stdout/stderr ends up in; archived per run.
    "PIPELINE_LOG": "/tmp/thesis_pipeline.log",
    # Remote CSV archived as baseline_metrics.csv and removed with sudo.
    # NOTE(review): presumably written by metrics-collector.service - confirm.
    "METRICS_CSV": "/var/log/thesis_baseline_metrics.csv",
    # Remote file holding the PID of the background pipeline process.
    "VM_PIDFILE": "/tmp/thesis_pipeline.pid",
    # Valid values for --workload; step 5 iterates over all of them.
    "WORKLOADS": ["high-bw", "high-iops", "interference", "batch-out", "batch-in"],
    # Runs are numbered 1..RUNS_PER_WORKLOAD in steps 3, 4 and 5.
    "RUNS_PER_WORKLOAD": 3,
    "DURATIONS": {
        # Step 4 (full cycle): --baseline-hours.
        "full_baseline_hrs": 2,
        # Steps 3 and 5: --baseline-hours.
        "isolated_baseline_hrs": 1.5,
        # Steps 3 and 5: --chaos-mins / --cooldown-mins.
        "chaos_mins": 10,
        "cooldown_mins": 10,
        # Step 4: --chaos-mins / --cooldown-mins.
        "baseline_chaos_mins": 15,
        "baseline_cooldown_mins": 15,
        # Steps 1 and 2: length of the idle measurement in minutes.
        "idle_mins": 60,
    },
}
# Global run-mode switches; main() overwrites both from the CLI flags.
DRY_RUN = False
RESUME = False

# Both directories have to exist before the FileHandler below opens its file.
for _required_dir in (CONFIG["LOG_DIR"], CONFIG["OUTPUT_DIR"]):
    os.makedirs(_required_dir, exist_ok=True)

# Every message goes to the master log file and is mirrored on stdout.
logging.basicConfig(
    level=logging.INFO,
    format="[%(asctime)s] %(message)s",
    handlers=[
        logging.FileHandler(CONFIG["LOG_DIR"] / "evaluation_master.log"),
        logging.StreamHandler(sys.stdout),
    ],
)
def is_completed(label):
    """Tell whether the run *label* may be skipped.

    A run is treated as finished only in resume mode and only when the
    directory ``OUTPUT_DIR/pipeline_<label>`` already exists. The skip is
    logged. Outside resume mode the answer is always ``False``.

    NOTE(review): only the directory's existence is checked, not its
    contents - confirm that a partially archived run cannot leave it behind.
    """
    if RESUME and (CONFIG["OUTPUT_DIR"] / f"pipeline_{label}").exists():
        logging.info(f">>> [SKIP] '{label}' bereits abgeschlossen.")
        return True
    return False
def run_local(cmd, check=True, capture=False):
    """Execute *cmd* through the local shell.

    Args:
        cmd: Shell command line (run with ``shell=True``).
        check: When true, a non-zero exit status raises
            ``subprocess.CalledProcessError`` after the failure was logged.
        capture: When true, stdout/stderr are captured as text on the result.

    Returns:
        The ``subprocess.CompletedProcess``, or ``None`` in dry-run mode,
        where the command is only logged.
    """
    if DRY_RUN:
        logging.info(f"[DRY-RUN] Local Exec: {cmd}")
        return None

    try:
        completed = subprocess.run(
            cmd, shell=True, check=check, capture_output=capture, text=True
        )
    except subprocess.CalledProcessError:
        # subprocess.run only raises this when check is true, so reaching
        # this handler always means a fatal failure.
        logging.error(f"Kritischer Fehler bei Befehl: {cmd}")
        raise
    return completed
def vm_ssh(cmd, check=True):
    """Run *cmd* on the VM over ssh and return ``run_local``'s result.

    The remote command is embedded in double quotes on the local command
    line, so the local shell still interprets ``$`` and backticks; callers
    escape them (``\\$``) when they must reach the VM unexpanded.
    Host key checking is disabled for the connection.
    """
    identity = CONFIG["SSH_KEY"]
    login = f'{CONFIG["VM_USER"]}@{CONFIG["VM_HOST"]}'
    return run_local(
        f'ssh -i {identity} -o StrictHostKeyChecking=no {login} "{cmd}"',
        check=check,
    )
def rsync_from_vm(remote_path, local_dest):
    """Copy *remote_path* from the VM to *local_dest* with rsync over ssh.

    The transfer is best effort: it runs with ``check=False`` and the result
    is discarded, so a missing remote file does not abort the evaluation.
    In dry-run mode the command is only logged.
    """
    transport = f"ssh -i {CONFIG['SSH_KEY']} -o StrictHostKeyChecking=no"
    source = f"{CONFIG['VM_USER']}@{CONFIG['VM_HOST']}:{remote_path}"
    rsync_cmd = f"rsync -avz -e '{transport}' {source} {local_dest}"

    if DRY_RUN:
        logging.info(f"[DRY-RUN] Rsync Exec: {rsync_cmd}")
        return
    run_local(rsync_cmd, check=False)
def run_cleanup_jobs():
    """Run the configured cleanup script with ``python3``.

    A failing script raises through ``run_local`` (``check`` defaults to
    true). In dry-run mode the command is only logged.
    """
    logging.info("Starte cleanup_jobs.py...")
    cleanup_cmd = f"python3 {CONFIG['CLEANUP_SCRIPT']}"
    if not DRY_RUN:
        run_local(cleanup_cmd)
        return
    logging.info(f"[DRY-RUN] {cleanup_cmd}")
def wait_for_port(port=60001, timeout=90):
    """Block until no local TCP listener occupies *port*.

    The listening sockets are polled every two seconds with ``ss``. If the
    port is still in use after *timeout* seconds, the error is logged and the
    process exits with status 1. Does nothing in dry-run mode.

    Args:
        port: TCP port that has to become free.
        timeout: Maximum number of seconds to wait.
    """
    if DRY_RUN:
        return
    logging.info(f"Warte auf Port {port}...")

    # The port number must be followed by whitespace or end of line, so that
    # e.g. port 80 does not also match a listener on 8080.
    probe = f"ss -tlnp 2>/dev/null | grep -qE ':{port}([[:space:]]|$)'"

    # A monotonic clock keeps the timeout correct even if the wall clock is
    # adjusted while waiting.
    deadline = time.monotonic() + timeout

    # grep exits with 0 while a matching listener exists.
    while subprocess.run(probe, shell=True).returncode == 0:
        if time.monotonic() > deadline:
            logging.error(f"Port {port} blockiert immer noch!")
            sys.exit(1)
        time.sleep(2)
def _tail_vm_pipeline_log(lines=20):
    """Return the last *lines* lines of the pipeline log on the VM.

    The ssh call is issued through ``run_local`` with ``capture=True`` so the
    output is available as a string. Returns ``""`` when nothing could be read.
    """
    ssh_cmd = (
        f"ssh -i {CONFIG['SSH_KEY']} -o StrictHostKeyChecking=no "
        f'{CONFIG["VM_USER"]}@{CONFIG["VM_HOST"]} '
        f'"tail -n {lines} {CONFIG["PIPELINE_LOG"]}"'
    )
    completed = run_local(ssh_cmd, check=False, capture=True)
    if completed is None or not completed.stdout:
        return ""
    return completed.stdout


def start_pipeline_on_vm():
    """Start the pipeline binary in the background on the VM and verify it.

    The binary is launched from ``PIPELINE_WORKDIR`` with stdout/stderr
    redirected to ``PIPELINE_LOG``, and its PID is written to ``VM_PIDFILE``.
    After a 10 second grace period the PID is probed with ``kill -0``. If the
    process is gone, the tail of the remote log is logged and the script
    exits with status 1. In dry-run mode only the start command is logged.
    """
    logging.info("Starte Pipeline auf VM...")
    pipeline_log = CONFIG["PIPELINE_LOG"]
    pidfile = CONFIG["VM_PIDFILE"]

    # "\\$!" becomes "\$!" so the local shell leaves it alone and the PID of
    # the background job is expanded on the VM.
    cmd = (
        f"cd {CONFIG['PIPELINE_WORKDIR']} && bash -c '"
        f"{CONFIG['PIPELINE_BIN']} > {pipeline_log} 2>&1 & disown; "
        f"echo \\$! > {pidfile}'"
    )
    vm_ssh(cmd)
    if DRY_RUN:
        return

    logging.info("Warte 10s auf Initialisierung...")
    time.sleep(10)

    check_cmd = f"[ -f {pidfile} ] && kill -0 \\$(cat {pidfile}) 2>/dev/null"
    result = vm_ssh(check_cmd, check=False)
    if result is not None and result.returncode == 0:
        logging.info("Pipeline erfolgreich gestartet und aktiv.")
        return

    logging.error("Pipeline-Start-Check fehlgeschlagen!")
    # The tail has to be captured explicitly; without capturing, the remote
    # output never reaches the log message below.
    log_tail = _tail_vm_pipeline_log()
    if log_tail:
        logging.error(
            f"Letzte Zeilen aus {pipeline_log} auf der VM:\n{log_tail}"
        )
    else:
        logging.error(
            "Das Log-File auf der VM konnte nicht gelesen werden oder ist leer."
        )
    sys.exit(1)
def stop_pipeline_on_vm():
    """Terminate the pipeline process recorded in the VM's pidfile.

    If the pidfile exists and its PID is alive, the process is sent the
    default ``kill`` signal and polled once per second for at most 30
    iterations. The pidfile is removed afterwards in any case. A failing ssh
    call raises through ``vm_ssh`` (``check`` defaults to true).

    NOTE(review): there is no forced kill if the process survives the 30
    polls - confirm this is intended.
    """
    logging.info("Stoppe Pipeline auf VM...")
    pidfile = CONFIG["VM_PIDFILE"]
    # The script text is sent verbatim. Every "$" is escaped so that it is
    # expanded on the VM and not by the local shell.
    stop_script = rf"""
if [ -f {pidfile} ]; then
PID=\$(cat {pidfile})
if kill -0 "\$PID" 2>/dev/null; then
kill "\$PID"
for i in \$(seq 1 30); do
kill -0 "\$PID" 2>/dev/null || break
sleep 1
done
fi
rm -f {pidfile}
fi
"""
    vm_ssh(stop_script)
def toggle_metrics(state="start"):
    """Start or stop ``metrics-collector.service`` on the VM via systemctl.

    Args:
        state: ``"start"`` starts the service; any other value stops it.
    """
    action = "start" if state == "start" else "stop"
    # Spell the German verbs out: deriving them with capitalize() + "e"
    # produced the misspelled "Stope" for the stop case.
    verb = "Starte" if action == "start" else "Stoppe"
    logging.info(f"{verb} Metrics-Collector...")
    vm_ssh(f"sudo systemctl {action} metrics-collector.service")
def archive_and_reset(label):
    """Copy the run's artefacts from the VM, then delete them remotely.

    The database, anomalies file, metrics CSV and pipeline log are fetched
    into ``OUTPUT_DIR/pipeline_<label>``. Afterwards the database files
    (``PIPELINE_DB*``), the anomalies file and the metrics CSV are removed on
    the VM; the pipeline log is not removed.

    NOTE(review): the copies are best effort, so the remote files are deleted
    even when a transfer failed - confirm this is acceptable.
    """
    dest = CONFIG["OUTPUT_DIR"] / f"pipeline_{label}"
    if not DRY_RUN:
        os.makedirs(dest, exist_ok=True)

    # (CONFIG key of the remote path, file name inside the archive folder)
    artifacts = (
        ("PIPELINE_DB", "pipeline.duckdb"),
        ("PIPELINE_ANOMALIES", "anomalies.jsonl"),
        ("METRICS_CSV", "baseline_metrics.csv"),
        ("PIPELINE_LOG", "pipeline.log"),
    )
    for config_key, local_name in artifacts:
        rsync_from_vm(CONFIG[config_key], dest / local_name)

    vm_ssh(
        f"rm -f {CONFIG['PIPELINE_DB']}* {CONFIG['PIPELINE_ANOMALIES']} && sudo rm -f {CONFIG['METRICS_CSV']}"
    )
def run_validation_experiment(label, generator_args):
    """Run one generator experiment without starting the pipeline on the VM.

    Sequence: kill leftover generators, wait for port 60001, run the cleanup
    script, start metrics collection, run the generator (output teed into
    ``LOG_DIR/<label>_generator.log``), stop metrics collection, archive the
    artefacts and move the ground-truth file to ``OUTPUT_DIR/gt_<label>.csv``.
    Returns immediately when the run is already completed in resume mode.

    NOTE(review): the generator's exit status is hidden behind ``| tee``, so
    a failing generator is not detected here - confirm this is intended.
    """
    if is_completed(label):
        return

    banner = "=" * 60
    logging.info(f"\n{banner}\nRUN: {label}\n{banner}")

    run_local("pkill -f '[s]cenario-generator' || true", check=False)
    wait_for_port(60001)
    run_cleanup_jobs()

    toggle_metrics("start")
    generator_log = CONFIG["LOG_DIR"] / f"{label}_generator.log"
    run_local(f"{CONFIG['GENERATOR']} {generator_args} 2>&1 | tee {generator_log}")
    toggle_metrics("stop")

    archive_and_reset(label)

    ground_truth = Path(CONFIG["GROUND_TRUTH"])
    if DRY_RUN or not ground_truth.exists():
        return
    shutil.move(ground_truth, CONFIG["OUTPUT_DIR"] / f"gt_{label}.csv")
def run_experiment(label, generator_args):
    """Run one generator experiment with the pipeline active on the VM.

    Sequence: kill leftover generators, wait for port 60001, run the cleanup
    script, start the pipeline and metrics collection, run the generator
    (output teed into ``LOG_DIR/<label>_generator.log``), stop the pipeline
    and metrics collection, archive the artefacts and move the ground-truth
    file to ``OUTPUT_DIR/gt_<label>.csv``. Returns immediately when the run
    is already completed in resume mode.

    NOTE(review): the generator's exit status is hidden behind ``| tee``, so
    a failing generator is not detected here - confirm this is intended.
    """
    if is_completed(label):
        return

    banner = "=" * 60
    logging.info(f"\n{banner}\nRUN: {label}\n{banner}")

    run_local("pkill -f '[s]cenario-generator' || true", check=False)
    wait_for_port(60001)
    run_cleanup_jobs()

    start_pipeline_on_vm()
    toggle_metrics("start")
    generator_log = CONFIG["LOG_DIR"] / f"{label}_generator.log"
    run_local(f"{CONFIG['GENERATOR']} {generator_args} 2>&1 | tee {generator_log}")
    stop_pipeline_on_vm()
    toggle_metrics("stop")

    archive_and_reset(label)

    ground_truth = Path(CONFIG["GROUND_TRUTH"])
    if DRY_RUN or not ground_truth.exists():
        return
    shutil.move(ground_truth, CONFIG["OUTPUT_DIR"] / f"gt_{label}.csv")
def step_1_system_baseline():
    """Step 1: record metrics for ``idle_mins`` minutes with nothing running.

    Neither the pipeline nor the generator is started. The metrics CSV is
    copied to ``OUTPUT_DIR/pipeline_system_baseline`` and then removed on the
    VM. Skipped when already completed in resume mode.
    """
    if is_completed("system_baseline"):
        return
    logging.info("--- SCHRITT 1: System-Baseline ---")

    run_cleanup_jobs()
    toggle_metrics("start")
    if not DRY_RUN:
        idle_seconds = CONFIG["DURATIONS"]["idle_mins"] * 60
        time.sleep(idle_seconds)
    toggle_metrics("stop")

    dest = CONFIG["OUTPUT_DIR"] / "pipeline_system_baseline"
    if not DRY_RUN:
        os.makedirs(dest, exist_ok=True)
    rsync_from_vm(CONFIG["METRICS_CSV"], dest / "baseline_metrics.csv")
    vm_ssh(f"sudo rm -f {CONFIG['METRICS_CSV']}")
def step_2_pipeline_baseline():
    """Step 2: record metrics while the pipeline runs under the idle workload.

    The generator is launched in the background with ``--workload=idle`` and
    killed after ``idle_mins`` minutes. Afterwards the pipeline and the
    metrics collector are stopped and the artefacts are archived under the
    label ``idle_pipeline_baseline``. Skipped when already completed in
    resume mode.
    """
    if is_completed("idle_pipeline_baseline"):
        return
    logging.info("--- SCHRITT 2: Pipeline-Baseline ---")

    run_cleanup_jobs()
    start_pipeline_on_vm()
    toggle_metrics("start")

    idle_minutes = CONFIG["DURATIONS"]["idle_mins"]
    idle_hrs = idle_minutes / 60.0
    gen_args = f"--inventory={CONFIG['INVENTORY']} --workload=idle --baseline-hours={idle_hrs} --run-experiment"
    # The trailing "&" detaches the generator, so this call returns at once.
    run_local(
        f"{CONFIG['GENERATOR']} {gen_args} > {CONFIG['LOG_DIR']}/idle_baseline.log 2>&1 &"
    )
    if not DRY_RUN:
        time.sleep(idle_minutes * 60)

    run_local("pkill -f '[s]cenario-generator' || true", check=False)
    stop_pipeline_on_vm()
    toggle_metrics("stop")
    archive_and_reset("idle_pipeline_baseline")
def step_3_validation(target_run=None):
    """Step 3: validation runs (``--workload=all``, pipeline not started).

    Args:
        target_run: Run number to execute on its own. When falsy, runs
            ``1..RUNS_PER_WORKLOAD`` are executed in order.

    A 120 second pause follows every executed run except the last one. Runs
    that are already completed in resume mode are skipped without a pause.
    """
    d = CONFIG["DURATIONS"]
    runs_to_do = (
        [target_run] if target_run else list(range(1, CONFIG["RUNS_PER_WORKLOAD"] + 1))
    )
    # The generator arguments do not depend on the run number.
    args = (
        f"--inventory={CONFIG['INVENTORY']} --workload=all --run-experiment "
        f"--baseline-hours={d['isolated_baseline_hrs']} --chaos-mins={d['chaos_mins']} "
        f"--cooldown-mins={d['cooldown_mins']}"
    )
    for r in runs_to_do:
        label = f"validation_run{r}"
        # Skip here already, so that a resumed evaluation does not sleep two
        # minutes after each run it did not actually execute.
        if is_completed(label):
            continue
        run_validation_experiment(label, args)
        if not DRY_RUN and r != runs_to_do[-1]:
            logging.info("Pause zwischen Runs...")
            time.sleep(120)
def step_4_full_cycle(target_run=None):
    """Step 4: full-cycle runs (``--workload=all``, pipeline active).

    Uses the long baseline (``full_baseline_hrs``) and the baseline chaos and
    cooldown durations.

    Args:
        target_run: Run number to execute on its own. When falsy, runs
            ``1..RUNS_PER_WORKLOAD`` are executed in order.

    A 120 second pause follows every executed run except the last one. Runs
    that are already completed in resume mode are skipped without a pause.
    """
    d = CONFIG["DURATIONS"]
    runs_to_do = (
        [target_run] if target_run else list(range(1, CONFIG["RUNS_PER_WORKLOAD"] + 1))
    )
    # The generator arguments do not depend on the run number.
    args = (
        f"--inventory={CONFIG['INVENTORY']} --workload=all --run-experiment "
        f"--baseline-hours={d['full_baseline_hrs']} --chaos-mins={d['baseline_chaos_mins']} "
        f"--cooldown-mins={d['baseline_cooldown_mins']}"
    )
    for r in runs_to_do:
        label = f"full_cycle_run{r}"
        # Skip here already, so that a resumed evaluation does not sleep two
        # minutes after each run it did not actually execute.
        if is_completed(label):
            continue
        run_experiment(label, args)
        if not DRY_RUN and r != runs_to_do[-1]:
            logging.info("Pause zwischen Runs...")
            time.sleep(120)
def step_5_isolated(target_workload=None, target_run=None):
    """Step 5: isolated runs, one workload at a time, pipeline active.

    Args:
        target_workload: Single workload to run. When falsy, every entry of
            ``CONFIG["WORKLOADS"]`` is run.
        target_run: Run number to execute on its own. When falsy, runs
            ``1..RUNS_PER_WORKLOAD`` are executed per workload.

    A 120 second pause follows every executed run except the very last one,
    including the transition from one workload to the next. Runs that are
    already completed in resume mode are skipped without a pause.
    """
    workloads = [target_workload] if target_workload else CONFIG["WORKLOADS"]
    runs_to_do = (
        [target_run] if target_run else list(range(1, CONFIG["RUNS_PER_WORKLOAD"] + 1))
    )
    d = CONFIG["DURATIONS"]

    # Flatten the schedule so "is this the last run?" is decided over the
    # whole plan. Checking only the run number dropped the pause at each
    # workload boundary.
    schedule = [(wl, r) for wl in workloads for r in runs_to_do]
    last_index = len(schedule) - 1

    for index, (wl, r) in enumerate(schedule):
        label = f"{wl}_run{r}"
        # Skip here already, so that a resumed evaluation does not sleep two
        # minutes after each run it did not actually execute.
        if is_completed(label):
            continue
        args = (
            f"--inventory={CONFIG['INVENTORY']} --workload={wl} --run-experiment "
            f"--baseline-hours={d['isolated_baseline_hrs']} --chaos-mins={d['chaos_mins']} "
            f"--cooldown-mins={d['cooldown_mins']}"
        )
        run_experiment(label, args)
        if not DRY_RUN and index != last_index:
            logging.info("Pause zwischen Runs...")
            time.sleep(120)
def _abort_cleanup():
    """Best-effort teardown after an interrupt.

    Stops the pipeline and the metrics collector on the VM and kills the
    local generator. A failure in one action is logged and does not prevent
    the remaining actions from running.
    """
    remote_actions = (
        ("stop_pipeline_on_vm", stop_pipeline_on_vm),
        ("toggle_metrics(stop)", lambda: toggle_metrics("stop")),
    )
    for description, action in remote_actions:
        try:
            action()
        except subprocess.CalledProcessError as exc:
            logging.error(f"Cleanup-Schritt '{description}' fehlgeschlagen: {exc}")
    # No check: pkill exits non-zero when no generator is running.
    subprocess.run("pkill -f '[s]cenario-generator'", shell=True)


def main():
    """Parse the command line and run the selected evaluation step(s).

    Options:
        --step      ``1``..``5`` or ``all`` (default); selects what to run.
        --workload  Restricts step 5 to a single workload.
        --run       Restricts steps 3, 4 and 5 to a single run number.
        --dry-run   Log the commands instead of executing them.
        --resume    Skip runs whose result directory already exists.

    On Ctrl-C the pipeline, the metrics collector and the generator are
    stopped (unless in dry-run mode) and the process exits with status 1.
    """
    global DRY_RUN, RESUME
    parser = argparse.ArgumentParser(description="Evaluation Automator")
    parser.add_argument(
        "--step", choices=["1", "2", "3", "4", "5", "all"], default="all"
    )
    parser.add_argument("--workload", choices=CONFIG["WORKLOADS"])
    parser.add_argument("--run", type=int)
    parser.add_argument("--dry-run", action="store_true")
    parser.add_argument("--resume", action="store_true")
    args = parser.parse_args()

    DRY_RUN = args.dry_run
    RESUME = args.resume
    if DRY_RUN:
        logging.info("!!! DRY-RUN MODUS !!!")
    if RESUME:
        logging.info(">>> RESUME MODUS AKTIVIERT.")

    try:
        if args.step in ["1", "all"]:
            step_1_system_baseline()
        if args.step in ["2", "all"]:
            step_2_pipeline_baseline()
        if args.step in ["3", "all"]:
            step_3_validation(args.run)
        if args.step in ["4", "all"]:
            step_4_full_cycle(args.run)
        if args.step in ["5", "all"]:
            step_5_isolated(args.workload, args.run)
        logging.info("\nEVALUATION ABGESCHLOSSEN.")
    except KeyboardInterrupt:
        logging.warning("\nAbbruch! Cleanup...")
        if not DRY_RUN:
            # Each teardown action is isolated, so an unreachable VM cannot
            # prevent the local generator from being killed.
            _abort_cleanup()
        sys.exit(1)


if __name__ == "__main__":
    main()