#!/usr/bin/env python3
"""Evaluation automator.

Runs the scenario generator locally, controls the pipeline process and the
metrics collector service on the VM over SSH, and archives the artefacts of
every run into ``CONFIG["OUTPUT_DIR"]``.
"""
import argparse
import logging
import os
import shlex
import shutil
import subprocess
import sys
import time
from pathlib import Path

CONFIG = {
    # Local tools and locations.
    "INVENTORY": Path.home()
    / "dev/privat/Go/BachelorThesis/bachelor-infra/ansible/inventory.ini",
    "GENERATOR": "./scenario-generator",
    "GROUND_TRUTH": "thesis_scenario_ground_truth.csv",
    "OUTPUT_DIR": Path("./evaluation_results"),
    "LOG_DIR": Path("./evaluation_logs"),
    "CLEANUP_SCRIPT": "./cleanup_jobs.py",
    # SSH access to the VM.
    "SSH_KEY": Path.home()
    / "dev/privat/Go/BachelorThesis/bachelor-infra/infrastructure/tf-cloud-init",
    "VM_USER": "baUser",
    "VM_HOST": "192.168.122.97",
    # Paths on the VM.
    "PIPELINE_WORKDIR": "/home/baUser/anomaly-detector-pipeline",
    "PIPELINE_BIN": "/home/baUser/anomaly-detector-pipeline/anomaly-pipeline",
    "PIPELINE_DB": "/home/baUser/anomaly-detector-pipeline/data/pipeline_test.duckdb",
    "PIPELINE_DB_WAL": "/home/baUser/anomaly-detector-pipeline/data/pipeline_test.duckdb.wal",
    "PIPELINE_ANOMALIES": "/home/baUser/anomaly-detector-pipeline/logs/anomalies.jsonl",
    "PIPELINE_LOG": "/tmp/thesis_pipeline.log",
    "METRICS_CSV": "/var/log/thesis_baseline_metrics.csv",
    "VM_PIDFILE": "/tmp/thesis_pipeline.pid",
    # Experiment matrix.
    "WORKLOADS": ["high-bw", "high-iops", "interference", "batch-out", "batch-in"],
    "RUNS_PER_WORKLOAD": 3,
    "DURATIONS": {
        "full_baseline_hrs": 2,
        "isolated_baseline_hrs": 1.5,
        "chaos_mins": 10,
        "cooldown_mins": 10,
        "baseline_chaos_mins": 15,
        "baseline_cooldown_mins": 15,
        "idle_mins": 60,
    },
}

# Set from the command line in main().
DRY_RUN = False
RESUME = False

os.makedirs(CONFIG["LOG_DIR"], exist_ok=True)
os.makedirs(CONFIG["OUTPUT_DIR"], exist_ok=True)

logging.basicConfig(
    level=logging.INFO,
    format="[%(asctime)s] %(message)s",
    handlers=[
        logging.FileHandler(CONFIG["LOG_DIR"] / "evaluation_master.log"),
        logging.StreamHandler(sys.stdout),
    ],
)


def is_completed(label):
    """Return True if resume mode is on and the result directory for *label* exists.

    NOTE(review): archive_and_reset() creates that directory before copying,
    so a run interrupted during archiving is also reported as completed.
    """
    if not RESUME:
        return False
    path = CONFIG["OUTPUT_DIR"] / f"pipeline_{label}"
    if path.exists():
        logging.info(f">>> [SKIP] '{label}' bereits abgeschlossen.")
        return True
    return False


def run_local(cmd, check=True, capture=False):
    """Run the shell command line *cmd* locally.

    Returns the ``subprocess.CompletedProcess``, or ``None`` in dry-run mode
    (the command is only logged then). With ``check=True`` a non-zero exit
    status is logged and re-raised as ``subprocess.CalledProcessError``.
    With ``capture=True`` stdout/stderr are captured as text.
    """
    if DRY_RUN:
        logging.info(f"[DRY-RUN] Local Exec: {cmd}")
        return None
    try:
        return subprocess.run(
            cmd, shell=True, check=check, capture_output=capture, text=True
        )
    except subprocess.CalledProcessError:
        # Only raised when check=True.
        logging.error(f"Kritischer Fehler bei Befehl: {cmd}")
        raise


def build_ssh_command(cmd):
    """Return the local shell command line that executes *cmd* on the VM.

    *cmd* is shell-quoted as a single argument, so the remote shell receives
    it verbatim: quotes, ``$`` and newlines need no extra escaping by callers.
    """
    key = shlex.quote(str(CONFIG["SSH_KEY"]))
    target = shlex.quote(f"{CONFIG['VM_USER']}@{CONFIG['VM_HOST']}")
    return f"ssh -i {key} -o StrictHostKeyChecking=no {target} {shlex.quote(cmd)}"


def vm_ssh(cmd, check=True, capture=False):
    """Run *cmd* on the VM over SSH; arguments and result as for run_local()."""
    return run_local(build_ssh_command(cmd), check=check, capture=capture)


def rsync_from_vm(remote_path, local_dest):
    """Copy *remote_path* from the VM to *local_dest* (best effort).

    A failed transfer does not raise, because not every artefact exists for
    every kind of run; it is reported as a warning instead.
    """
    transport = (
        f"ssh -i {shlex.quote(str(CONFIG['SSH_KEY']))} -o StrictHostKeyChecking=no"
    )
    source = f"{CONFIG['VM_USER']}@{CONFIG['VM_HOST']}:{remote_path}"
    rsync_cmd = (
        f"rsync -avz -e {shlex.quote(transport)} "
        f"{shlex.quote(source)} {shlex.quote(str(local_dest))}"
    )
    if DRY_RUN:
        logging.info(f"[DRY-RUN] Rsync Exec: {rsync_cmd}")
        return
    result = run_local(rsync_cmd, check=False)
    if result is not None and result.returncode != 0:
        logging.warning(
            f"rsync von '{remote_path}' fehlgeschlagen (Exit-Code {result.returncode})."
        )


def run_cleanup_jobs():
    """Run the local cleanup script given by CONFIG["CLEANUP_SCRIPT"]."""
    logging.info("Starte cleanup_jobs.py...")
    if DRY_RUN:
        logging.info(f"[DRY-RUN] python3 {CONFIG['CLEANUP_SCRIPT']}")
    else:
        run_local(f"python3 {CONFIG['CLEANUP_SCRIPT']}")


def wait_for_port(port=60001, timeout=90):
    """Block until no local TCP listener is bound to *port*.

    Polls ``ss`` every 2 seconds and exits the program with status 1 when the
    port is still in use after *timeout* seconds. Does nothing in dry-run mode.
    """
    if DRY_RUN:
        return
    logging.info(f"Warte auf Port {port}...")
    # Require a non-digit (or end of line) after the port so that e.g. 60001
    # does not also match 600010.
    probe = f"ss -tlnp 2>/dev/null | grep -qE ':{port}([^0-9]|$)'"
    deadline = time.monotonic() + timeout
    # grep exits 0 while a listener on the port is still listed.
    while subprocess.run(probe, shell=True).returncode == 0:
        if time.monotonic() > deadline:
            logging.error(f"Port {port} blockiert immer noch!")
            sys.exit(1)
        time.sleep(2)


def start_pipeline_on_vm():
    """Start the pipeline binary in the background on the VM and verify it runs.

    The PID is written to CONFIG["VM_PIDFILE"]. After 10 seconds the process
    is probed with ``kill -0``; if it is gone, the tail of the pipeline log is
    logged and the program exits with status 1.
    """
    logging.info("Starte Pipeline auf VM...")
    log_path = CONFIG["PIPELINE_LOG"]
    # The inner single quotes keep `$!` away from the remote login shell so
    # that it is expanded by the `bash -c` that started the background job.
    cmd = (
        f"cd {CONFIG['PIPELINE_WORKDIR']} && bash -c '"
        f"{CONFIG['PIPELINE_BIN']} > {log_path} 2>&1 & disown; "
        f"echo $! > {CONFIG['VM_PIDFILE']}'"
    )
    vm_ssh(cmd)
    if DRY_RUN:
        return

    logging.info("Warte 10s auf Initialisierung...")
    time.sleep(10)
    check_cmd = (
        f"[ -f {CONFIG['VM_PIDFILE']} ] && "
        f"kill -0 $(cat {CONFIG['VM_PIDFILE']}) 2>/dev/null"
    )
    result = vm_ssh(check_cmd, check=False)
    if result and result.returncode == 0:
        logging.info("Pipeline erfolgreich gestartet und aktiv.")
        return

    logging.error("Pipeline-Start-Check fehlgeschlagen!")
    # capture=True is required here; without it stdout is always None.
    log_tail = vm_ssh(f"tail -n 20 {log_path}", check=False, capture=True)
    if log_tail and log_tail.stdout:
        logging.error(
            f"Letzte Zeilen aus {log_path} auf der VM:\n{log_tail.stdout}"
        )
    else:
        logging.error(
            "Das Log-File auf der VM konnte nicht gelesen werden oder ist leer."
        )
    sys.exit(1)


def stop_pipeline_on_vm():
    """Terminate the pipeline recorded in the VM pid file and remove that file.

    Sends SIGTERM and waits up to 30 seconds for the process to disappear.
    """
    logging.info("Stoppe Pipeline auf VM...")
    pidfile = CONFIG["VM_PIDFILE"]
    cmd = f"""
if [ -f {pidfile} ]; then
    PID=$(cat {pidfile})
    if kill -0 "$PID" 2>/dev/null; then
        kill "$PID"
        for i in $(seq 1 30); do
            kill -0 "$PID" 2>/dev/null || break
            sleep 1
        done
    fi
    rm -f {pidfile}
fi
"""
    vm_ssh(cmd)


def toggle_metrics(state="start"):
    """Start the metrics-collector service on the VM, or stop it.

    Any *state* other than ``"start"`` is treated as ``"stop"``.
    """
    action = "start" if state == "start" else "stop"
    verb = "Starte" if action == "start" else "Stoppe"
    logging.info(f"{verb} Metrics-Collector...")
    vm_ssh(f"sudo systemctl {action} metrics-collector.service")


def archive_and_reset(label):
    """Copy the run artefacts from the VM into the result directory of *label*.

    Afterwards the database files, the anomaly log and the metrics CSV are
    deleted on the VM so that the next run starts without them.
    """
    dest = CONFIG["OUTPUT_DIR"] / f"pipeline_{label}"
    if not DRY_RUN:
        os.makedirs(dest, exist_ok=True)
    rsync_from_vm(CONFIG["PIPELINE_DB"], dest / "pipeline.duckdb")
    rsync_from_vm(CONFIG["PIPELINE_ANOMALIES"], dest / "anomalies.jsonl")
    rsync_from_vm(CONFIG["METRICS_CSV"], dest / "baseline_metrics.csv")
    rsync_from_vm(CONFIG["PIPELINE_LOG"], dest / "pipeline.log")
    # The trailing `*` is expanded on the VM and also covers the .wal file.
    vm_ssh(
        f"rm -f {CONFIG['PIPELINE_DB']}* {CONFIG['PIPELINE_ANOMALIES']} "
        f"&& sudo rm -f {CONFIG['METRICS_CSV']}"
    )


def _run_generator_experiment(label, generator_args, with_pipeline):
    """Run one generator experiment and archive it; return False if skipped."""
    if is_completed(label):
        return False
    logging.info(f"\n{'=' * 60}\nRUN: {label}\n{'=' * 60}")
    # Kill a leftover generator and wait until its port is released.
    run_local("pkill -f '[s]cenario-generator' || true", check=False)
    wait_for_port(60001)
    run_cleanup_jobs()
    if with_pipeline:
        start_pipeline_on_vm()
    toggle_metrics("start")
    log_file = CONFIG["LOG_DIR"] / f"{label}_generator.log"
    # NOTE(review): the exit status of this pipeline is that of `tee`, so a
    # failing generator does not raise here.
    run_local(
        f"{CONFIG['GENERATOR']} {generator_args} 2>&1 | tee {shlex.quote(str(log_file))}"
    )
    if with_pipeline:
        stop_pipeline_on_vm()
    toggle_metrics("stop")
    archive_and_reset(label)
    gt_file = Path(CONFIG["GROUND_TRUTH"])
    if not DRY_RUN and gt_file.exists():
        shutil.move(str(gt_file), str(CONFIG["OUTPUT_DIR"] / f"gt_{label}.csv"))
    return True


def run_validation_experiment(label, generator_args):
    """Run the generator WITHOUT starting the pipeline on the VM.

    Returns True if the run was executed, False if it was skipped (resume).
    """
    return _run_generator_experiment(label, generator_args, with_pipeline=False)


def run_experiment(label, generator_args):
    """Run the generator while the pipeline is running on the VM.

    Returns True if the run was executed, False if it was skipped (resume).
    """
    return _run_generator_experiment(label, generator_args, with_pipeline=True)


def step_1_system_baseline():
    """Step 1: collect metrics for ``idle_mins`` with neither pipeline nor generator."""
    if is_completed("system_baseline"):
        return
    logging.info("--- SCHRITT 1: System-Baseline ---")
    run_cleanup_jobs()
    toggle_metrics("start")
    if not DRY_RUN:
        time.sleep(CONFIG["DURATIONS"]["idle_mins"] * 60)
    toggle_metrics("stop")
    dest = CONFIG["OUTPUT_DIR"] / "pipeline_system_baseline"
    if not DRY_RUN:
        os.makedirs(dest, exist_ok=True)
    rsync_from_vm(CONFIG["METRICS_CSV"], dest / "baseline_metrics.csv")
    vm_ssh(f"sudo rm -f {CONFIG['METRICS_CSV']}")


def step_2_pipeline_baseline():
    """Step 2: run pipeline and idle workload for ``idle_mins``, then archive."""
    if is_completed("idle_pipeline_baseline"):
        return
    logging.info("--- SCHRITT 2: Pipeline-Baseline ---")
    run_cleanup_jobs()
    start_pipeline_on_vm()
    toggle_metrics("start")
    idle_hrs = CONFIG["DURATIONS"]["idle_mins"] / 60.0
    gen_args = (
        f"--inventory={shlex.quote(str(CONFIG['INVENTORY']))} --workload=idle "
        f"--baseline-hours={idle_hrs} --run-experiment"
    )
    idle_log = shlex.quote(f"{CONFIG['LOG_DIR']}/idle_baseline.log")
    # The trailing `&` backgrounds the generator; it is killed after the sleep.
    run_local(f"{CONFIG['GENERATOR']} {gen_args} > {idle_log} 2>&1 &")
    if not DRY_RUN:
        time.sleep(CONFIG["DURATIONS"]["idle_mins"] * 60)
    run_local("pkill -f '[s]cenario-generator' || true", check=False)
    stop_pipeline_on_vm()
    toggle_metrics("stop")
    archive_and_reset("idle_pipeline_baseline")


def _selected_runs(target_run):
    """Return ``[target_run]`` if given, else all run numbers 1..RUNS_PER_WORKLOAD."""
    if target_run is not None:
        return [target_run]
    return list(range(1, CONFIG["RUNS_PER_WORKLOAD"] + 1))


def _generator_args(workload, baseline_hrs, chaos_mins, cooldown_mins):
    """Build the generator command-line arguments for one experiment run."""
    return (
        f"--inventory={shlex.quote(str(CONFIG['INVENTORY']))} "
        f"--workload={workload} --run-experiment "
        f"--baseline-hours={baseline_hrs} --chaos-mins={chaos_mins} "
        f"--cooldown-mins={cooldown_mins}"
    )


def _pause_between_runs(executed, is_last):
    """Sleep 120 s after an executed run that is not the last of its series."""
    # A run skipped in resume mode did nothing, so no pause is needed after it.
    if DRY_RUN or is_last or not executed:
        return
    logging.info("Pause zwischen Runs...")
    time.sleep(120)


def step_3_validation(target_run=None):
    """Step 3: validation runs (workload ``all``, pipeline not started).

    *target_run* restricts the step to a single run number.
    """
    d = CONFIG["DURATIONS"]
    runs = _selected_runs(target_run)
    args = _generator_args(
        "all", d["isolated_baseline_hrs"], d["chaos_mins"], d["cooldown_mins"]
    )
    for r in runs:
        executed = run_validation_experiment(f"validation_run{r}", args)
        _pause_between_runs(executed, is_last=(r == runs[-1]))


def step_4_full_cycle(target_run=None):
    """Step 4: full-cycle runs (workload ``all``, pipeline running).

    *target_run* restricts the step to a single run number.
    """
    d = CONFIG["DURATIONS"]
    runs = _selected_runs(target_run)
    args = _generator_args(
        "all",
        d["full_baseline_hrs"],
        d["baseline_chaos_mins"],
        d["baseline_cooldown_mins"],
    )
    for r in runs:
        executed = run_experiment(f"full_cycle_run{r}", args)
        _pause_between_runs(executed, is_last=(r == runs[-1]))


def step_5_isolated(target_workload=None, target_run=None):
    """Step 5: one series of runs per workload, pipeline running.

    *target_workload* / *target_run* restrict the step to one workload and/or
    one run number.
    """
    workloads = [target_workload] if target_workload else CONFIG["WORKLOADS"]
    runs = _selected_runs(target_run)
    d = CONFIG["DURATIONS"]
    for wl in workloads:
        args = _generator_args(
            wl, d["isolated_baseline_hrs"], d["chaos_mins"], d["cooldown_mins"]
        )
        for r in runs:
            executed = run_experiment(f"{wl}_run{r}", args)
            # NOTE(review): as before, there is no pause between the last run
            # of one workload and the first run of the next - confirm intent.
            _pause_between_runs(executed, is_last=(r == runs[-1]))


def _emergency_cleanup():
    """Best-effort shutdown of pipeline, metrics collector and generator."""
    if DRY_RUN:
        return
    for action in (stop_pipeline_on_vm, lambda: toggle_metrics("stop")):
        try:
            action()
        except subprocess.CalledProcessError:
            # Already logged by run_local(); continue with the remaining
            # cleanup steps instead of aborting on the first failure.
            continue
    subprocess.run("pkill -f '[s]cenario-generator'", shell=True)


def main():
    """Parse the command line and run the selected evaluation step(s).

    Exits with status 1 after cleaning up when interrupted with Ctrl-C or
    when a checked command fails.
    """
    global DRY_RUN, RESUME
    parser = argparse.ArgumentParser(description="Evaluation Automator")
    parser.add_argument(
        "--step", choices=["1", "2", "3", "4", "5", "all"], default="all"
    )
    parser.add_argument("--workload", choices=CONFIG["WORKLOADS"])
    parser.add_argument("--run", type=int)
    parser.add_argument("--dry-run", action="store_true")
    parser.add_argument("--resume", action="store_true")
    args = parser.parse_args()

    DRY_RUN = args.dry_run
    RESUME = args.resume
    if DRY_RUN:
        logging.info("!!! DRY-RUN MODUS !!!")
    if RESUME:
        logging.info(">>> RESUME MODUS AKTIVIERT.")

    try:
        if args.step in ["1", "all"]:
            step_1_system_baseline()
        if args.step in ["2", "all"]:
            step_2_pipeline_baseline()
        if args.step in ["3", "all"]:
            step_3_validation(args.run)
        if args.step in ["4", "all"]:
            step_4_full_cycle(args.run)
        if args.step in ["5", "all"]:
            step_5_isolated(args.workload, args.run)
        logging.info("\nEVALUATION ABGESCHLOSSEN.")
    except KeyboardInterrupt:
        logging.warning("\nAbbruch! Cleanup...")
        _emergency_cleanup()
        sys.exit(1)
    except subprocess.CalledProcessError as err:
        # Do not leave the pipeline or the collector running after a failure.
        logging.error(f"Abbruch nach Fehler (Exit-Code {err.returncode}). Cleanup...")
        _emergency_cleanup()
        sys.exit(1)


if __name__ == "__main__":
    main()