379 lines
12 KiB
Python
379 lines
12 KiB
Python
#!/usr/bin/env python3
|
|
import argparse
|
|
import logging
|
|
import os
|
|
import shutil
|
|
import subprocess
|
|
import sys
|
|
import time
|
|
from pathlib import Path
|
|
|
|
# Shared prefixes, factored out so that relocating the infra checkout or the
# pipeline directory on the VM is a single-line change.
_INFRA_ROOT = Path.home() / "dev/privat/Go/BachelorThesis/bachelor-infra"
_PIPELINE_ROOT = "/home/baUser/anomaly-detector-pipeline"
_PIPELINE_DB = f"{_PIPELINE_ROOT}/data/pipeline_test.duckdb"

# Central configuration for the whole evaluation run.
CONFIG = {
    # --- local machine ---
    # Ansible inventory handed to the generator via --inventory.
    "INVENTORY": _INFRA_ROOT / "ansible/inventory.ini",
    # Scenario generator executable, invoked through the local shell.
    "GENERATOR": "./scenario-generator",
    # Ground-truth CSV; moved into OUTPUT_DIR after a run if it exists.
    "GROUND_TRUTH": "thesis_scenario_ground_truth.csv",
    "OUTPUT_DIR": Path("./evaluation_results"),
    "LOG_DIR": Path("./evaluation_logs"),
    "CLEANUP_SCRIPT": "./cleanup_jobs.py",
    # --- SSH access to the VM ---
    "SSH_KEY": _INFRA_ROOT / "infrastructure/tf-cloud-init",
    "VM_USER": "baUser",
    "VM_HOST": "192.168.122.97",
    # --- paths on the VM ---
    "PIPELINE_WORKDIR": _PIPELINE_ROOT,
    "PIPELINE_BIN": f"{_PIPELINE_ROOT}/anomaly-pipeline",
    "PIPELINE_DB": _PIPELINE_DB,
    # Not referenced directly by the code in this file; the reset step removes
    # it through the "<PIPELINE_DB>*" glob.
    "PIPELINE_DB_WAL": f"{_PIPELINE_DB}.wal",
    "PIPELINE_ANOMALIES": f"{_PIPELINE_ROOT}/logs/anomalies.jsonl",
    "PIPELINE_LOG": "/tmp/thesis_pipeline.log",
    "METRICS_CSV": "/var/log/thesis_baseline_metrics.csv",
    "VM_PIDFILE": "/tmp/thesis_pipeline.pid",
    # --- experiment plan ---
    "WORKLOADS": ["high-bw", "high-iops", "interference", "batch-out", "batch-in"],
    "RUNS_PER_WORKLOAD": 3,
    "DURATIONS": {
        # --baseline-hours for the full-cycle runs (step 4).
        "full_baseline_hrs": 2,
        # --baseline-hours for the validation and isolated runs (steps 3 and 5).
        "isolated_baseline_hrs": 1.5,
        # --chaos-mins / --cooldown-mins for steps 3 and 5.
        "chaos_mins": 10,
        "cooldown_mins": 10,
        # --chaos-mins / --cooldown-mins for step 4.
        "baseline_chaos_mins": 15,
        "baseline_cooldown_mins": 15,
        # Length of the idle baselines (steps 1 and 2).
        "idle_mins": 60,
    },
}
|
|
|
|
# Runtime switches; main() overwrites both from the command line.
DRY_RUN = False
RESUME = False

# Create the log and result directories up front; the file log handler below
# needs LOG_DIR to exist.
for _required_dir in (CONFIG["LOG_DIR"], CONFIG["OUTPUT_DIR"]):
    os.makedirs(_required_dir, exist_ok=True)

# Every message goes to the master log file and to stdout.
_master_log_path = CONFIG["LOG_DIR"] / "evaluation_master.log"
logging.basicConfig(
    level=logging.INFO,
    format="[%(asctime)s] %(message)s",
    handlers=[
        logging.FileHandler(_master_log_path),
        logging.StreamHandler(sys.stdout),
    ],
)
|
|
|
|
def is_completed(label):
    """Return True if the run *label* can be skipped in resume mode.

    A run counts as completed when RESUME is set and its archive directory
    ``OUTPUT_DIR/pipeline_<label>`` already exists. Without RESUME the result
    is always False and the file system is not consulted.
    """
    if RESUME and (CONFIG["OUTPUT_DIR"] / f"pipeline_{label}").exists():
        logging.info(f">>> [SKIP] '{label}' bereits abgeschlossen.")
        return True
    return False
|
|
|
|
|
|
def run_local(cmd, check=True, capture=False):
    """Run *cmd* through the local shell.

    Args:
        cmd: Shell command line (executed with ``shell=True``).
        check: If true, a non-zero exit status raises
            ``subprocess.CalledProcessError``.
        capture: If true, stdout/stderr are captured (as text) on the result
            instead of being inherited from this process.

    Returns:
        The ``subprocess.CompletedProcess``, or ``None`` in dry-run mode,
        where the command is only logged.

    Raises:
        subprocess.CalledProcessError: The command failed and *check* is true.
    """
    if DRY_RUN:
        logging.info(f"[DRY-RUN] Local Exec: {cmd}")
        return None
    try:
        return subprocess.run(
            cmd, shell=True, check=check, capture_output=capture, text=True
        )
    except subprocess.CalledProcessError as e:
        # subprocess.run only raises this when check=True, so there is no
        # "swallow and return None" path to handle here.
        logging.error(f"Kritischer Fehler bei Befehl: {cmd}")
        logging.error(f"Exit-Code: {e.returncode}")
        if e.stderr:
            # Only populated when the caller asked for captured output.
            logging.error(e.stderr.strip())
        raise
|
|
|
|
|
|
def vm_ssh(cmd, check=True, capture=False):
    """Run *cmd* on the VM over SSH and return run_local()'s result.

    The remote command is embedded in double quotes in the local shell command
    line, so the local shell still expands ``$``: callers that need a ``$`` to
    reach the VM must escape it as ``\\$``.

    Args:
        cmd: Command line to execute on the VM.
        check: Forwarded to run_local(); a non-zero exit status raises when true.
        capture: Forwarded to run_local(); set it to read the remote
            stdout/stderr from the returned result. Defaults to False, which
            keeps the previous behaviour (output goes to the terminal).

    Returns:
        Whatever run_local() returns (``None`` in dry-run mode).
    """
    target = f'{CONFIG["VM_USER"]}@{CONFIG["VM_HOST"]}'
    ssh_cmd = (
        f"ssh -i {CONFIG['SSH_KEY']} -o StrictHostKeyChecking=no "
        f'{target} "{cmd}"'
    )
    return run_local(ssh_cmd, check=check, capture=capture)
|
|
|
|
|
|
def rsync_from_vm(remote_path, local_dest):
    """Copy *remote_path* from the VM to *local_dest* with rsync (best effort).

    A failed transfer does not raise, because callers fetch files that may not
    exist for every kind of run. It is reported with a warning instead of
    being dropped silently.

    Args:
        remote_path: Path on the VM.
        local_dest: Local destination path.

    Returns:
        True if rsync exited with status 0 (or in dry-run mode, where the
        command is only logged), False otherwise.
    """
    rsync_cmd = (
        f"rsync -avz -e 'ssh -i {CONFIG['SSH_KEY']} -o StrictHostKeyChecking=no' "
        f"{CONFIG['VM_USER']}@{CONFIG['VM_HOST']}:{remote_path} {local_dest}"
    )
    if DRY_RUN:
        logging.info(f"[DRY-RUN] Rsync Exec: {rsync_cmd}")
        return True

    result = run_local(rsync_cmd, check=False)
    if result is None or result.returncode != 0:
        exit_code = "?" if result is None else result.returncode
        logging.warning(
            f"Rsync von {remote_path} fehlgeschlagen (Exit-Code {exit_code})."
        )
        return False
    return True
|
|
|
|
|
|
def run_cleanup_jobs():
    """Run the configured cleanup script locally with ``python3``.

    In dry-run mode only the command is logged. Otherwise a failing script
    propagates as an exception from run_local(), which is called with its
    default ``check=True``.
    """
    logging.info("Starte cleanup_jobs.py...")
    if not DRY_RUN:
        run_local(f"python3 {CONFIG['CLEANUP_SCRIPT']}")
        return
    logging.info(f"[DRY-RUN] python3 {CONFIG['CLEANUP_SCRIPT']}")
|
|
|
|
|
|
def wait_for_port(port=60001, timeout=90):
    """Block until no local TCP socket is listening on *port* any more.

    Despite the name, this waits for the port to become FREE: it polls ``ss``
    every two seconds and returns as soon as the port no longer shows up among
    the listening sockets. Does nothing in dry-run mode.

    Args:
        port: Local TCP port to watch.
        timeout: Seconds to wait before giving up.

    Exits the process with status 1 if the port is still in use after
    *timeout* seconds.
    """
    if DRY_RUN:
        return
    logging.info(f"Warte auf Port {port}...")

    # Require whitespace after the port number so that e.g. port 80 does not
    # also match ":8080" (a bare ':80' pattern is a prefix match).
    probe = f"ss -tlnp 2>/dev/null | grep -qE ':{port}[[:space:]]'"
    # Monotonic clock: the timeout must not be affected by wall-clock jumps.
    deadline = time.monotonic() + timeout

    # grep exits 0 while a matching listener exists.
    while subprocess.run(probe, shell=True).returncode == 0:
        if time.monotonic() > deadline:
            logging.error(f"Port {port} blockiert immer noch!")
            sys.exit(1)
        time.sleep(2)
|
|
|
|
|
|
def start_pipeline_on_vm():
    """Launch the pipeline binary on the VM and verify that it stays alive.

    The binary is started in the background from the pipeline working
    directory, with its output redirected to ``CONFIG['PIPELINE_LOG']`` and its
    PID written to ``CONFIG['VM_PIDFILE']``. After a 10 s grace period the PID
    is probed with ``kill -0``.

    In dry-run mode only the launch command is logged and no liveness check is
    performed. If the liveness check fails, the last lines of the remote log
    are logged and the process exits with status 1.
    """
    logging.info("Starte Pipeline auf VM...")

    pipeline_log = CONFIG["PIPELINE_LOG"]
    pidfile = CONFIG["VM_PIDFILE"]

    # "\\$!" reaches the local shell as "\$!", so the "$!" is only expanded by
    # the inner bash on the VM.
    launch_cmd = (
        f"cd {CONFIG['PIPELINE_WORKDIR']} && bash -c '"
        f"{CONFIG['PIPELINE_BIN']} > {pipeline_log} 2>&1 & disown; "
        f"echo \\$! > {pidfile}'"
    )
    vm_ssh(launch_cmd)

    if DRY_RUN:
        # Nothing was started, so there is nothing to verify.
        return

    logging.info("Warte 10s auf Initialisierung...")
    time.sleep(10)

    check_cmd = f"[ -f {pidfile} ] && kill -0 \\$(cat {pidfile}) 2>/dev/null"
    result = vm_ssh(check_cmd, check=False)
    if result and result.returncode == 0:
        logging.info("Pipeline erfolgreich gestartet und aktiv.")
        return

    logging.error("Pipeline-Start-Check fehlgeschlagen!")

    # Fetch the log tail with captured output. Going through vm_ssh() would
    # leave .stdout as None, because it does not capture.
    tail_cmd = (
        f"ssh -i {CONFIG['SSH_KEY']} -o StrictHostKeyChecking=no "
        f'{CONFIG["VM_USER"]}@{CONFIG["VM_HOST"]} "tail -n 20 {pipeline_log}"'
    )
    log_tail = run_local(tail_cmd, check=False, capture=True)
    if log_tail and log_tail.stdout:
        logging.error(
            f"Letzte Zeilen aus {pipeline_log} auf der VM:\n{log_tail.stdout}"
        )
    else:
        logging.error(
            "Das Log-File auf der VM konnte nicht gelesen werden oder ist leer."
        )
    sys.exit(1)
|
|
|
|
|
|
def stop_pipeline_on_vm():
    """Stop the pipeline process recorded in the VM pidfile.

    If the pidfile exists and its process is alive, the process gets SIGTERM
    and up to 30 seconds to exit. A process that is still alive afterwards is
    sent SIGKILL, so that it cannot keep running untracked once the pidfile
    has been removed. The pidfile is deleted in every case; a missing pidfile
    is not an error.
    """
    logging.info("Stoppe Pipeline auf VM...")
    pidfile = CONFIG["VM_PIDFILE"]
    # The script is embedded in double quotes by vm_ssh(), so it must not
    # contain double quotes itself, and every "$" meant for the VM is escaped
    # as "\$". The PID is numeric, so leaving it unquoted is safe.
    cmd = rf"""
if [ -f {pidfile} ]; then
    PID=\$(cat {pidfile})
    if kill -0 \$PID 2>/dev/null; then
        kill \$PID
        for i in \$(seq 1 30); do
            kill -0 \$PID 2>/dev/null || break
            sleep 1
        done
        if kill -0 \$PID 2>/dev/null; then
            kill -9 \$PID 2>/dev/null || true
        fi
    fi
    rm -f {pidfile}
fi
"""
    vm_ssh(cmd)
|
|
|
|
|
|
def toggle_metrics(state="start"):
    """Start or stop ``metrics-collector.service`` on the VM via systemctl.

    Args:
        state: ``"start"`` starts the service; any other value stops it.
    """
    if state == "start":
        action, verb = "start", "Starte"
    else:
        # Spelled out: deriving the verb from the action name produced the
        # misspelt "Stope".
        action, verb = "stop", "Stoppe"
    logging.info(f"{verb} Metrics-Collector...")
    vm_ssh(f"sudo systemctl {action} metrics-collector.service")
|
|
|
|
|
|
def archive_and_reset(label):
    """Copy the run artefacts from the VM into the archive, then wipe them.

    The pipeline database, anomaly log, metrics CSV and pipeline log are
    fetched into ``OUTPUT_DIR/pipeline_<label>``. Afterwards the database
    (including any files matching ``<db>*``), the anomaly log and the metrics
    CSV are removed on the VM.

    Args:
        label: Run label used to name the archive directory.
    """
    dest = CONFIG["OUTPUT_DIR"] / f"pipeline_{label}"
    if not DRY_RUN:
        os.makedirs(dest, exist_ok=True)

    # (CONFIG key of the remote file, file name inside the archive directory)
    artefacts = (
        ("PIPELINE_DB", "pipeline.duckdb"),
        ("PIPELINE_ANOMALIES", "anomalies.jsonl"),
        ("METRICS_CSV", "baseline_metrics.csv"),
        ("PIPELINE_LOG", "pipeline.log"),
    )
    for config_key, archive_name in artefacts:
        rsync_from_vm(CONFIG[config_key], dest / archive_name)

    vm_ssh(
        f"rm -f {CONFIG['PIPELINE_DB']}* {CONFIG['PIPELINE_ANOMALIES']} && sudo rm -f {CONFIG['METRICS_CSV']}"
    )
|
|
|
|
|
|
def run_validation_experiment(label, generator_args):
    """Run one generator experiment WITHOUT starting the pipeline on the VM.

    Sequence: kill leftover generators, wait for port 60001 to be free, run
    the cleanup script, start the metrics collector, run the generator (output
    is tee'd to ``LOG_DIR/<label>_generator.log``), stop the collector, archive
    and reset the VM artefacts, and finally move the ground-truth file into
    the output directory if it exists.

    Args:
        label: Run label; also decides whether the run is skipped on resume.
        generator_args: Argument string appended to the generator command.
    """
    if is_completed(label):
        return

    logging.info(f"\n{'=' * 60}\nRUN: {label}\n{'=' * 60}")

    run_local("pkill -f '[s]cenario-generator' || true", check=False)

    wait_for_port(60001)
    run_cleanup_jobs()

    log_file = CONFIG["LOG_DIR"] / f"{label}_generator.log"
    try:
        toggle_metrics("start")
        # NOTE(review): the exit status of this pipeline is tee's, so a failing
        # generator is not detected by check=True. Confirm whether intended.
        run_local(f"{CONFIG['GENERATOR']} {generator_args} 2>&1 | tee {log_file}")
    finally:
        # Never leave the collector running on the VM, even if the run aborts.
        toggle_metrics("stop")

    archive_and_reset(label)

    gt_file = Path(CONFIG["GROUND_TRUTH"])
    if not DRY_RUN and gt_file.exists():
        shutil.move(gt_file, CONFIG["OUTPUT_DIR"] / f"gt_{label}.csv")
|
|
|
|
|
|
def run_experiment(label, generator_args):
    """Run one generator experiment with the pipeline active on the VM.

    Sequence: kill leftover generators, wait for port 60001 to be free, run
    the cleanup script, start the pipeline and the metrics collector, run the
    generator (output is tee'd to ``LOG_DIR/<label>_generator.log``), stop
    pipeline and collector, archive and reset the VM artefacts, and finally
    move the ground-truth file into the output directory if it exists.

    Args:
        label: Run label; also decides whether the run is skipped on resume.
        generator_args: Argument string appended to the generator command.
    """
    if is_completed(label):
        return

    logging.info(f"\n{'=' * 60}\nRUN: {label}\n{'=' * 60}")

    run_local("pkill -f '[s]cenario-generator' || true", check=False)

    wait_for_port(60001)
    run_cleanup_jobs()
    start_pipeline_on_vm()

    log_file = CONFIG["LOG_DIR"] / f"{label}_generator.log"
    try:
        toggle_metrics("start")
        # NOTE(review): the exit status of this pipeline is tee's, so a failing
        # generator is not detected by check=True. Confirm whether intended.
        run_local(f"{CONFIG['GENERATOR']} {generator_args} 2>&1 | tee {log_file}")
    finally:
        # Never leave the pipeline or the collector running on the VM, even if
        # the run aborts half-way. The collector is stopped even when stopping
        # the pipeline itself fails.
        try:
            stop_pipeline_on_vm()
        finally:
            toggle_metrics("stop")

    archive_and_reset(label)

    gt_file = Path(CONFIG["GROUND_TRUTH"])
    if not DRY_RUN and gt_file.exists():
        shutil.move(gt_file, CONFIG["OUTPUT_DIR"] / f"gt_{label}.csv")
|
|
|
|
|
|
def step_1_system_baseline():
    """Step 1: record system metrics on the VM with nothing else running.

    Runs the cleanup script, lets the metrics collector run for the configured
    idle duration (the wait is skipped in dry-run mode), then fetches the
    metrics CSV into ``OUTPUT_DIR/pipeline_system_baseline`` and deletes it on
    the VM. Skipped when already completed in resume mode.
    """
    label = "system_baseline"
    if is_completed(label):
        return

    logging.info("--- SCHRITT 1: System-Baseline ---")
    idle_seconds = CONFIG["DURATIONS"]["idle_mins"] * 60

    run_cleanup_jobs()
    toggle_metrics("start")
    if not DRY_RUN:
        time.sleep(idle_seconds)
    toggle_metrics("stop")

    archive_dir = CONFIG["OUTPUT_DIR"] / f"pipeline_{label}"
    if not DRY_RUN:
        os.makedirs(archive_dir, exist_ok=True)
    rsync_from_vm(CONFIG["METRICS_CSV"], archive_dir / "baseline_metrics.csv")
    vm_ssh(f"sudo rm -f {CONFIG['METRICS_CSV']}")
|
|
|
|
|
|
def step_2_pipeline_baseline():
    """Step 2: record a baseline with the pipeline running and an idle workload.

    Starts the pipeline and the metrics collector, launches the generator with
    ``--workload=idle`` in the background, waits for the configured idle
    duration (skipped in dry-run mode), then kills the generator, stops the
    pipeline and the collector, and archives the results under the label
    ``idle_pipeline_baseline``. Skipped when already completed in resume mode.
    """
    label = "idle_pipeline_baseline"
    if is_completed(label):
        return

    logging.info("--- SCHRITT 2: Pipeline-Baseline ---")
    idle_mins = CONFIG["DURATIONS"]["idle_mins"]

    run_cleanup_jobs()
    start_pipeline_on_vm()
    toggle_metrics("start")

    idle_hrs = idle_mins / 60.0
    gen_args = f"--inventory={CONFIG['INVENTORY']} --workload=idle --baseline-hours={idle_hrs} --run-experiment"
    # The trailing "&" backgrounds the generator; this script does the timing.
    run_local(
        f"{CONFIG['GENERATOR']} {gen_args} > {CONFIG['LOG_DIR']}/idle_baseline.log 2>&1 &"
    )
    if not DRY_RUN:
        time.sleep(idle_mins * 60)

    run_local("pkill -f '[s]cenario-generator' || true", check=False)
    stop_pipeline_on_vm()
    toggle_metrics("stop")
    archive_and_reset(label)
|
|
|
|
|
|
def step_3_validation(target_run=None):
    """Step 3: validation runs (all workloads, pipeline not started).

    Args:
        target_run: Run number to execute on its own; when falsy, runs
            ``1..RUNS_PER_WORKLOAD`` are executed.

    Between two executed runs the script pauses for 120 s (not in dry-run
    mode, and not after the last run). Runs that are already completed in
    resume mode are skipped without that pause.
    """
    d = CONFIG["DURATIONS"]
    runs_to_do = (
        [target_run] if target_run else range(1, CONFIG["RUNS_PER_WORKLOAD"] + 1)
    )
    for r in runs_to_do:
        label = f"validation_run{r}"
        if is_completed(label):
            # Nothing ran, so there is nothing to cool down from.
            continue
        args = (
            f"--inventory={CONFIG['INVENTORY']} --workload=all --run-experiment "
            f"--baseline-hours={d['isolated_baseline_hrs']} --chaos-mins={d['chaos_mins']} "
            f"--cooldown-mins={d['cooldown_mins']}"
        )
        run_validation_experiment(label, args)
        if not DRY_RUN and r != runs_to_do[-1]:
            logging.info("Pause zwischen Runs...")
            time.sleep(120)
|
|
|
|
|
|
def step_4_full_cycle(target_run=None):
    """Step 4: full-cycle runs (all workloads, pipeline active, long baseline).

    Args:
        target_run: Run number to execute on its own; when falsy, runs
            ``1..RUNS_PER_WORKLOAD`` are executed.

    Between two executed runs the script pauses for 120 s (not in dry-run
    mode, and not after the last run). Runs that are already completed in
    resume mode are skipped without that pause.
    """
    d = CONFIG["DURATIONS"]
    runs_to_do = (
        [target_run] if target_run else range(1, CONFIG["RUNS_PER_WORKLOAD"] + 1)
    )
    for r in runs_to_do:
        label = f"full_cycle_run{r}"
        if is_completed(label):
            # Nothing ran, so there is nothing to cool down from.
            continue
        args = (
            f"--inventory={CONFIG['INVENTORY']} --workload=all --run-experiment "
            f"--baseline-hours={d['full_baseline_hrs']} --chaos-mins={d['baseline_chaos_mins']} "
            f"--cooldown-mins={d['baseline_cooldown_mins']}"
        )
        run_experiment(label, args)
        if not DRY_RUN and r != runs_to_do[-1]:
            logging.info("Pause zwischen Runs...")
            time.sleep(120)
|
|
|
|
|
|
def step_5_isolated(target_workload=None, target_run=None):
    """Step 5: isolated runs, one workload at a time, with the pipeline active.

    Args:
        target_workload: Single workload to run; when falsy, every entry of
            ``CONFIG['WORKLOADS']`` is run.
        target_run: Run number to execute on its own; when falsy, runs
            ``1..RUNS_PER_WORKLOAD`` are executed.

    Within a workload the script pauses for 120 s between two executed runs
    (not in dry-run mode, and not after the workload's last run). Runs that
    are already completed in resume mode are skipped without that pause.
    """
    workloads = [target_workload] if target_workload else CONFIG["WORKLOADS"]
    runs_to_do = (
        [target_run] if target_run else range(1, CONFIG["RUNS_PER_WORKLOAD"] + 1)
    )
    d = CONFIG["DURATIONS"]

    for wl in workloads:
        for r in runs_to_do:
            label = f"{wl}_run{r}"
            if is_completed(label):
                # Nothing ran, so there is nothing to cool down from.
                continue
            args = (
                f"--inventory={CONFIG['INVENTORY']} --workload={wl} --run-experiment "
                f"--baseline-hours={d['isolated_baseline_hrs']} --chaos-mins={d['chaos_mins']} "
                f"--cooldown-mins={d['cooldown_mins']}"
            )
            run_experiment(label, args)
            if not DRY_RUN and r != runs_to_do[-1]:
                logging.info("Pause zwischen Runs...")
                time.sleep(120)
|
|
|
|
|
|
def _cleanup_after_interrupt():
    """Best-effort teardown after Ctrl-C: pipeline, collector, generator."""
    remote_steps = (
        stop_pipeline_on_vm,
        lambda: toggle_metrics("stop"),
    )
    for step in remote_steps:
        try:
            step()
        except subprocess.CalledProcessError as exc:
            # One failing step must not prevent the remaining cleanup.
            logging.error(f"Cleanup-Schritt fehlgeschlagen: {exc}")
    subprocess.run("pkill -f '[s]cenario-generator'", shell=True)


def main():
    """Parse the command line and run the selected evaluation step(s).

    Options:
        --step      "1".."5" or "all" (default): which step(s) to execute.
        --workload  Restrict step 5 to a single workload.
        --run       Restrict steps 3, 4 and 5 to a single run number.
        --dry-run   Log commands instead of executing them.
        --resume    Skip runs whose archive directory already exists.

    On Ctrl-C the pipeline, the metrics collector and the local generator are
    stopped (unless in dry-run mode) and the process exits with status 1.
    """
    global DRY_RUN, RESUME
    parser = argparse.ArgumentParser(description="Evaluation Automator")
    parser.add_argument(
        "--step", choices=["1", "2", "3", "4", "5", "all"], default="all"
    )
    parser.add_argument("--workload", choices=CONFIG["WORKLOADS"])
    parser.add_argument("--run", type=int)
    parser.add_argument("--dry-run", action="store_true")
    parser.add_argument("--resume", action="store_true")

    args = parser.parse_args()
    DRY_RUN = args.dry_run
    RESUME = args.resume

    if DRY_RUN:
        logging.info("!!! DRY-RUN MODUS !!!")
    if RESUME:
        logging.info(">>> RESUME MODUS AKTIVIERT.")

    try:
        if args.step in ["1", "all"]:
            step_1_system_baseline()
        if args.step in ["2", "all"]:
            step_2_pipeline_baseline()
        if args.step in ["3", "all"]:
            step_3_validation(args.run)
        if args.step in ["4", "all"]:
            step_4_full_cycle(args.run)
        if args.step in ["5", "all"]:
            step_5_isolated(args.workload, args.run)
        logging.info("\nEVALUATION ABGESCHLOSSEN.")
    except KeyboardInterrupt:
        logging.warning("\nAbbruch! Cleanup...")
        if not DRY_RUN:
            _cleanup_after_interrupt()
        sys.exit(1)


if __name__ == "__main__":
    main()
|