add Final Infrastructure Setup

This commit is contained in:
Patryk Hegenberg 2026-03-29 13:45:10 +02:00
commit 7733dde658
174 changed files with 204949 additions and 0 deletions

View file

@ -0,0 +1,9 @@
#!/bin/bash
# Blackhole cleaner: periodically empties the transfer destination so that
# received test files do not accumulate on disk.

# Directory that gets swept.
TARGET="/local_testdata/test-destination"

printf '%s\n' "Starte Blackhole auf $TARGET"

# Sweep forever. Each pass deletes regular files whose modification time is
# more than half a minute old (-mmin +0.5; this relies on the installed find
# accepting a fractional minute count), then pauses for 10 seconds.
# Directories are never removed (-type f).
while :; do
  find "$TARGET" -type f -mmin +0.5 -delete
  sleep 10
done

View file

@ -0,0 +1,111 @@
# Export settings. How each key is interpreted is decided by the program that
# loads this file, which is not visible here.
export:
enabled: true
# presumably the number of records sent per export batch — confirm
batch_size: 100
# Durations in this section are strings with a unit suffix ("30s", "10s", "60s").
export_interval: "30s"
retry_attempts: 5
retry_backoff: "10s"
health_check_interval: "60s"
# Local storage backend: database file location and rotation limits.
localstorage:
enabled: true
# Relative path; it resolves against the working directory of the process.
db_path: "./watch.db"
rotation:
# 100 MiB written as a plain integer. YAML does not evaluate arithmetic, so the
# former value `100 * 1024 * 1024` was loaded as a string instead of a number.
# NOTE(review): the key is spelled `max_sizes_bytes` — confirm this is the
# exact name the loader expects (it may have been meant as `max_size_bytes`).
max_sizes_bytes: 104857600
max_age_hours: 24
max_files: 3
# NOTE(review): `minuntes` looks like a typo of `minutes`; left unchanged
# because renaming a key is only safe together with the code that reads it.
check_interval_minuntes: 5
# Empty string; presumably means "no separate archive directory" — confirm.
archive_dir: ""
# Elasticsearch export target. Disabled here (enabled: false).
elasticsearch:
enabled: false
url: "http://10.0.0.99:9200"
index: "watch"
# The three credential values below are placeholders, not real secrets.
# NOTE(review): both username/password and api_key are present; which one the
# client prefers is not visible here. Keep real credentials out of the repo.
username: "your-configured-user"
password: "your-super-secret-password"
api_key: "your-api-key"
# Unit is not stated; presumably seconds — confirm.
timeout: 30
# Built-in web service of the tool.
web_service:
enabled: true
# 0.0.0.0 conventionally means "listen on every interface".
# NOTE(review): confirm the endpoint is meant to be reachable from other hosts.
host: "0.0.0.0"
port: 9090
# System metrics collection: one on/off switch per metric family, followed by
# the targets (disk paths, interfaces, ports, hosts) the collectors look at.
system_metrics:
enabled: true
collect_cpu: true
collect_memory: true
collect_disk: true
collect_network: true
disk_paths:
- "/"
- "/var"
- "/home"
# NOTE(review): the shell metrics sampler in this same commit uses interface
# "ens4" while this list names "ens6" — confirm which one each host has.
network_interfaces:
- "ens6"
collect_network_connections: true
collect_load_average: true
collect_tcp_stats: true
collect_filehandles: true
collect_disk_io: true
collect_network_latency: true
collect_bandwidth_usage: true
# NOTE(review): the next two keys have plural names but hold a single scalar,
# unlike the list-valued keys above — confirm the loader accepts a scalar.
transfer_ports: 60003
latency_test_hosts: "www.google.de"
poll_interval_seconds: 30
# Relative path; it resolves against the working directory of the process.
patterns_file: "./configs/patterns.yaml"
# Logging of the tool itself: verbosity and destination file.
logging:
level: "info"
# Lives under /var/log, so the process needs write access to that location.
file_path: "/var/log/system-monitor.log"
# Drain3 settings. The key names (depth, sim_th, max_children, max_clusters)
# match the parameters of the Drain log-template mining algorithm — presumably
# passed through to it unchanged; confirm against the loader.
drain3:
enabled: true
# Relative path; it resolves against the working directory of the process.
state_dir: "./drain3_states"
depth: 4
sim_th: 0.4
max_children: 100
max_clusters: 1000
# Unit is not stated; presumably seconds — confirm.
save_interval: 60
# Services whose logs are collected by unit name (a single entry: nginx).
services:
- name: "nginx"
service: "nginx.service"
enabled: true
# Empty string; presumably means "no lower time bound" — confirm.
since_time: ""
priority: "info"
# File-based log sources. Each entry names a log file and a regex with named
# capture groups ((?P<name>...)); `fields` pairs every group name with a
# second name (for nginx-access these look like nginx variable names).
# NOTE(review): the direction and use of that mapping is decided by the
# loader — confirm.
tools:
- name: "nginx-access"
log_file: "/var/log/nginx/access.log"
enabled: true
buffer_size: 200
format:
name: "nginx_combined"
# The request part must consist of exactly three space-separated tokens
# (method, path, protocol) and body_bytes must be digits, so lines with a
# malformed request string do not match this pattern.
pattern: '^(?P<client_ip>\S+) \S+ \S+ \[(?P<timestamp>[^\]]+)\] "(?P<method>\S+) (?P<path>\S+) (?P<protocol>\S+)" (?P<status>\d+) (?P<body_bytes>\d+) "(?P<referer>[^"]*)" "(?P<user_agent>[^"]*)"'
fields:
client_ip: "remote_addr"
timestamp: "time_local"
method: "request_method"
path: "request_uri"
protocol: "server_protocol"
status: "status"
body_bytes: "body_bytes_sent"
referer: "http_referer"
user_agent: "http_user_agent"
- name: "nginx-error"
log_file: "/var/log/nginx/error.log"
enabled: true
buffer_size: 100
format:
name: "nginx_error"
# Expects "YYYY/MM/DD HH:MM:SS [level] pid#tid: message".
pattern: '^(?P<timestamp>\d{4}/\d{2}/\d{2} \d{2}:\d{2}:\d{2}) \[(?P<level>\w+)\] (?P<pid>\d+)#(?P<tid>\d+): (?P<message>.*)'
fields:
timestamp: "time"
level: "log_level"
pid: "process_id"
tid: "thread_id"
message: "error_message"

View file

@ -0,0 +1,15 @@
#!/bin/bash
# Blackhole service: keeps the incoming directory empty by deleting files
# shortly after they have been written.

# Directory in which TIXstream places received files (adjust as needed).
INCOMING_DIR="/data/tixel/incoming"

printf '%s\n' "Starte Blackhole-Service für $INCOMING_DIR..."

# Sweep forever, once every 10 seconds.
# -mmin +0.1 selects regular files whose last *modification* is more than a
# tenth of a minute (6 seconds) old; access time is not considered. The short
# grace period is what keeps files that are still being written from being
# removed mid-transfer.
while :; do
  find "$INCOMING_DIR" -type f -mmin +0.1 -delete
  sleep 10
done

View file

@ -0,0 +1,11 @@
#!/bin/bash
# Streams the journal of the relevant services (TIXstream engine and the
# transfer job manager) into a file as structured JSON.

# Destination file; new entries are appended.
log_file="/var/log/thesis_tixstream_logs.json"

# --follow        keep running and emit new entries as they arrive
# --unit          restrict output to the given units (both are captured)
# --output=json   structured JSON output
# --no-pager      never start a pager; required when running in the background
# exec replaces this shell with journalctl, so no wrapper process stays around.
exec journalctl \
  --unit=tixstream \
  --unit=transfer-job-manager \
  --follow \
  --output=json \
  --no-pager >>"$log_file"

View file

@ -0,0 +1,165 @@
patterns:
# ===========================================================================
# Common / Shared Patterns
# ===========================================================================
common:
extractors:
# Syslog-style line: "<Mon DD HH:MM:SS> <hostname> <process info>: <rest>".
# The leading timestamp group is named `syslog_timestamp` so that it lines up
# with the entry of the same name under `fields`; as an unnamed group it was
# never available under that name.
# NOTE(review): `\d{2}` only matches a two-digit day; a space-padded
# single-digit day ("Jan  2") would not match — confirm the input format.
- name: "syslog_header"
regex: '^(?P<syslog_timestamp>\w{3} \d{2} \d{2}:\d{2}:\d{2}) (?P<hostname>[^\s]+) (?P<process_info>[^:]+):\s*(?P<message_rest>.*)$'
fields:
syslog_timestamp: "time:Jan 02 15:04:05"
hostname: "string"
process_info: "string"
message_rest: "string"
# ISO-8601 / RFC 3339 style timestamp anywhere in the line; the fractional
# seconds and the trailing "Z" are both optional in the regex.
# NOTE(review): the time layout below spells out six fractional digits and a
# literal "Z" — confirm the parser accepts inputs that lack either.
- name: "timestamp_rfc3339"
regex: '(?P<timestamp>\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}(?:\.\d+)?Z?)'
fields:
timestamp: "time:2006-01-02T15:04:05.000000Z"
# ===========================================================================
# TIXstream Service
# Covers: tsServicePattern, tsTransferIDPattern, tsDetailPattern1-4
# ===========================================================================
tixstream:
extractors:
# Base line layout: "<level> <YYYY-MM-DD HH:MM:SS.ffffff> <message>".
- name: "service_log_base"
regex: '^(?P<log_level>\S+)\s+(?P<timestamp>\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2}\.\d{6})\s+(?P<message>.*)'
fields:
log_level: "string"
timestamp: "time:2006-01-02 15:04:05.000000"
message: "string"
# UUID-shaped transfer id (8-4-4-4-12 word characters) at the start of the text.
- name: "transfer_id_extraction"
regex: '^(?P<transfer_id>\w{8}-\w{4}-\w{4}-\w{4}-\w{12})\s+(?P<message>.*)'
fields:
transfer_id: "string"
message: "string"
# "in: Transfer start ..." detail line with transfer parameters.
- name: "transfer_start_in"
regex: 'in: Transfer start (?P<thread_info>\d+/\d+) buffers=(?P<buffers>\d+) files=(?P<file_count>\d+) size=(?P<size_mb>[0-9.]+) MByte chunksize=(?P<chunk_size>\d+) streams=(?P<streams>\d+) target-datarate=(?P<target_rate>[0-9.]+) MByte/s protocol=(?P<protocol>\w+) dest=(?P<destination>\S+) sender-id=(?P<sender_id>\S+)'
fields:
thread_info: "string" # e.g. "1/4" - hard to type more precisely, hence a string
buffers: "int"
file_count: "int"
size_mb: "float"
chunk_size: "int"
streams: "int"
target_rate: "float"
protocol: "string"
destination: "string"
sender_id: "string"
direction: "string" # the regex has no `direction` group; static fields can be injected by the parser or treated as implicit here
# "out: Start remote transfer to <target> request executed, duration=<s> s".
- name: "transfer_start_remote_out"
regex: 'out: Start remote transfer to (?P<target>[^\s]+) request executed, duration=(?P<duration>[0-9.]+) s'
fields:
target: "string"
duration: "float"
# Outgoing counterpart of transfer_start_in (src=/receiver= instead of dest=/sender-id=).
- name: "transfer_start_out"
regex: 'out: Transfer start (?P<thread_info>\d+/\d+) buffers=(?P<buffers>\d+) files=(?P<file_count>\d+) size=(?P<size_mb>[0-9.]+) MByte chunksize=(?P<chunk_size>\d+) streams=(?P<streams>\d+) target-datarate=(?P<target_rate>[0-9.]+) MByte/s protocol=(?P<protocol>\w+) src=(?P<source>\S+) receiver=(?P<receiver>\S+)'
fields:
thread_info: "string"
buffers: "int"
file_count: "int"
size_mb: "float"
chunk_size: "int"
streams: "int"
target_rate: "float"
protocol: "string"
source: "string"
receiver: "string"
# "out: Start transfer <n/m>, src=... dest=... item[0]=... count=<n>".
- name: "transfer_start_generic"
regex: 'out: Start transfer (?P<thread_info>\d+/\d+), src=(?P<source>[^ ]*) dest=(?P<destination>[^ ]*) item\[0\]=(?P<item0>[^ ]*) count=(?P<count>\d+)'
fields:
thread_info: "string"
source: "string"
destination: "string"
item0: "string"
count: "int"
# ===========================================================================
# Transfer Job Manager (TJM)
# Covers: tjmServicePattern, tjmTransferNamePattern, tjmTransferIDPattern1/2
# ===========================================================================
transfer-job-manager:
extractors:
# Base line layout: timestamp (millisecond precision), level, pid, three
# bracketed values (correlation id, username, thread id), class name,
# then " : " and the message.
- name: "service_log_base"
regex: '^(?P<timestamp>\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2}\.\d{3})\s+(?P<log_level>\S+)\s+(?P<pid>\d+).*?\[(?P<correlation_id>[^\]]*)\]\s+\[(?P<username>[^\]]*)\]\s+\[(?P<thread_id>[^\]]*)\]\s+(?P<java_class>.*?)\s+:\s+(?P<message>.*)'
fields:
timestamp: "time:2006-01-02 15:04:05.000"
log_level: "string"
pid: "int"
correlation_id: "string"
username: "string"
thread_id: "string"
java_class: "string"
message: "string"
# Transfer name of the form "<YYYYMMDD>T<HHMMSS>-<alnum>-<...>-in|out"
# followed by ":" and the message.
- name: "transfer_name_info"
regex: '^(?P<transfer_name_raw>\d{8}T\d{6}-[A-Za-z0-9]+-.+?-(?:in|out)) ?: (?P<message>.*)$'
fields:
transfer_name_raw: "string"
message: "string"
# UUID-shaped id anywhere in the text. The lazy `.*?` can match nothing, so
# `message` captures everything that follows the id.
- name: "transfer_id_mid"
regex: '(?P<transfer_id>\w{8}-\w{4}-\w{4}-\w{4}-\w{12}).*?(?P<message>.*)'
fields:
transfer_id: "string"
message: "string"
# Same, but also captures the text before the id. `prefix` is greedy, so if
# a line holds several ids the last one ends up in `transfer_id`.
- name: "transfer_id_prefixed"
regex: '(?P<prefix>.*)(?P<transfer_id>\w{8}-\w{4}-\w{4}-\w{4}-\w{12}).*?(?P<message>.*)'
fields:
prefix: "string"
transfer_id: "string"
message: "string"
# ===========================================================================
# Access Manager & TCC
# Covers: amServicePattern, tccServicePattern
# ===========================================================================
# Both services use the identical extractor: ISO timestamp ending in "Z"
# (fraction optional), level, pid, "---", bracketed thread id, logger name,
# ":" and the message.
# NOTE(review): the regex accepts any number of fractional digits (or none),
# while the time layout spells out exactly six — confirm the parser copes.
access-manager:
extractors:
- name: "spring_boot_log"
regex: '^(?P<timestamp>\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}(?:\.\d+)?Z)\s+(?P<log_level>\w+)\s+(?P<pid>\d+)\s+---\s+\[\s*(?P<thread_id>[^\]]*)\]\s+(?P<logger>[\w\.]+)\s*:\s+(?P<message>.*)$'
fields:
timestamp: "time:2006-01-02T15:04:05.000000Z"
log_level: "string"
pid: "int"
thread_id: "string"
logger: "string"
message: "string"
tixel-control-center:
extractors:
- name: "spring_boot_log"
regex: '^(?P<timestamp>\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}(?:\.\d+)?Z)\s+(?P<log_level>\w+)\s+(?P<pid>\d+)\s+---\s+\[\s*(?P<thread_id>[^\]]*)\]\s+(?P<logger>[\w\.]+)\s*:\s+(?P<message>.*)$'
fields:
timestamp: "time:2006-01-02T15:04:05.000000Z"
log_level: "string"
pid: "int"
thread_id: "string"
logger: "string"
message: "string"
# ===========================================================================
# Nginx
# Covers: nginxAccessPattern
# ===========================================================================
nginx:
extractors:
# Access log line: client ip, ident (ignored), remote user, bracketed
# timestamp, quoted request, status, bytes sent, and optionally the quoted
# referer and user agent.
# NOTE(review): `bytes_sent` may capture "-" but is typed "int" below —
# confirm how the parser handles a non-numeric value.
- name: "access_log"
regex: '^(?P<client_ip>\S+)\s+\S+\s+(?P<remote_user>\S+)\s+\[(?P<timestamp_nginx>[^\]]+)\]\s+"(?P<request>[^"]+)"\s+(?P<status_code>\d+)\s+(?P<bytes_sent>\d+|-)\s*(?:"(?P<referer>[^"]*)"\s+"(?P<user_agent>[^"]*)")?'
fields:
client_ip: "string"
remote_user: "string"
# Kept as a plain string; no time layout is declared for it.
timestamp_nginx: "string"
request: "string"
status_code: "int"
bytes_sent: "int"
referer: "string"
user_agent: "string"

View file

@ -0,0 +1,57 @@
#!/bin/bash
# Training-metrics sampler: configuration and CSV initialisation.

# CSV file the sampler writes to.
OUTPUT_FILE="/var/log/thesis_training_metrics.csv"
# Network interface whose byte counters are sampled.
IFACE="ens4"
# Host that is pinged for the round-trip-time column.
PING_TARGET="10.10.2.10"

# Start with a fresh file: ">" truncates anything left from an earlier run and
# leaves only the header row.
printf '%s\n' \
  "timestamp,cpu_user,cpu_sys,cpu_wait,ram_used_mb,disk_read_iops,disk_write_iops,net_rx_kb_s,net_tx_kb_s,rtt_ms" \
  >"$OUTPUT_FILE"
#######################################
# Print the cumulative number of completed read and write operations, summed
# over all whole disks (sdX, vdX, nvmeXnY; partitions are skipped so that I/O
# is not counted twice).
#
# The device name is matched against field 3 of each line. Matching against
# the end of the whole line can never succeed, because every diskstats line
# ends in counters, not in the device name.
#
# Arguments: $1 - file in /proc/diskstats format (optional,
#                 default: /proc/diskstats)
# Outputs:   one line "<reads> <writes>"; "0 0" when no device matches
#######################################
get_disk_iops() {
  local stats_file=${1:-/proc/diskstats}
  # Field 4 = reads completed, field 8 = writes completed.
  # %.0f keeps large counters from being printed in exponent notation.
  awk '
    $3 ~ /^(sd[a-z]+|vd[a-z]+|nvme[0-9]+n[0-9]+)$/ { reads += $4; writes += $8 }
    END { printf "%.0f %.0f\n", reads, writes }
  ' "$stats_file"
}
#######################################
# Print one cumulative counter of the monitored network interface.
# Globals:   IFACE (read) - interface name
# Arguments: $1 - counter file name below the interface's statistics
#                 directory, e.g. rx_bytes or tx_bytes
# Outputs:   the counter value as read from sysfs
# Returns:   exit status of cat (non-zero if the counter file is missing)
#######################################
get_net_bytes() {
  local counter_name=$1
  local stats_dir="/sys/class/net/$IFACE/statistics"
  cat "$stats_dir/$counter_name"
}
# Baseline counters. Every CSV row reports the change against the previous
# sample, divided by the time that actually elapsed between the two samples.
OLD_RX=$(get_net_bytes rx_bytes)
OLD_TX=$(get_net_bytes tx_bytes)
read -r OLD_DR OLD_DW <<<"$(get_disk_iops)"
OLD_TS=$(date +%s%N)

while true; do
  # "vmstat 1 2" prints two reports; only the last one (the 1 s window) is
  # used. In vmstat's default layout columns 13/14/16 are us/sy/wa.
  read -r CPU_US CPU_SY CPU_WA <<<"$(vmstat 1 2 | tail -n 1 | awk '{print $13, $14, $16}')"
  RAM_USED=$(free -m | awk '/Mem:/ {print $3}')

  # ping summary line: "rtt min/avg/max/mdev = a/b/c/d ms".
  # Take the values after " = " and pick the second one (avg). Splitting the
  # whole line on "/" and using field 4 yields "mdev = a", which converts to 0.
  RTT=$(ping -c 1 -W 1 -q "$PING_TARGET" |
    awk -F' = ' '/^rtt/ {split($2, v, "/"); printf "%.2f", v[2]}')
  # No reply (or no summary line): record 0.0 so the column is never empty.
  RTT=${RTT:-0.0}

  NEW_TS=$(date +%s%N)
  NEW_RX=$(get_net_bytes rx_bytes)
  NEW_TX=$(get_net_bytes tx_bytes)
  read -r NEW_DR NEW_DW <<<"$(get_disk_iops)"

  # Elapsed time in seconds (timestamps are in nanoseconds). It is always
  # above 1 s because vmstat alone blocks for one second.
  DT=$(echo "scale=3; ($NEW_TS - $OLD_TS) / 1000000000" | bc)
  RX_RATE=$(echo "scale=2; ($NEW_RX - $OLD_RX) / 1024 / $DT" | bc)
  TX_RATE=$(echo "scale=2; ($NEW_TX - $OLD_TX) / 1024 / $DT" | bc)
  READ_IOPS=$(echo "scale=2; ($NEW_DR - $OLD_DR) / $DT" | bc)
  WRITE_IOPS=$(echo "scale=2; ($NEW_DW - $OLD_DW) / $DT" | bc)

  TIMESTAMP=$(date +"%Y-%m-%d %H:%M:%S")
  echo "$TIMESTAMP,$CPU_US,$CPU_SY,$CPU_WA,$RAM_USED,$READ_IOPS,$WRITE_IOPS,$RX_RATE,$TX_RATE,$RTT" >>"$OUTPUT_FILE"

  # The current sample becomes the baseline of the next iteration.
  OLD_RX=$NEW_RX
  OLD_TX=$NEW_TX
  OLD_DR=$NEW_DR
  OLD_DW=$NEW_DW
  OLD_TS=$NEW_TS
  sleep 5
done

View file

@ -0,0 +1,63 @@
#!/bin/bash
# Resource-usage sampler: appends one CSV row per sample with system-wide
# CPU/RAM usage plus CPU/RSS of two monitored processes.
#
# The interpreter line above is new: the file previously contained only a
# commented-out shebang, so the shell running it was left to the caller.
#
# --- Legacy single-process variant (disabled, kept for reference) ----------
#
# OUTPUT_FILE="/var/log/thesis_resource_baseline.csv"
#
# if [ ! -f "$OUTPUT_FILE" ]; then
# echo "timestamp,cpu_percent,ram_kb,command" > "$OUTPUT_FILE"
# fi
#
# while true; do
# PID=$(pgrep -x "watch-tool")
#
# if [ ! -z "$PID" ]; then
# TIMESTAMP=$(date +"%Y-%m-%d %H:%M:%S")
#
# STATS=$(pidstat -p $PID -u -r 1 1 | tail -1)
#
# eval $(ps -p $PID -o %cpu,rss --no-headers | awk '{print "CPU="$1; print "RSS_KB="$2}')
#
# echo "$TIMESTAMP,$CPU,$RSS_KB,watch-tool" >> "$OUTPUT_FILE"
# fi
#
# sleep 10
# done
# ---------------------------------------------------------------------------

# CSV file the sampler appends to.
OUTPUT_FILE="/var/log/thesis_resource_usage.csv"
# NOTE(review): the tixstream_* columns are filled from the process matched by
# "transfer-job-manager" in the sampling loop — confirm the column naming.
HEADER="timestamp,system_cpu_usage,system_ram_used_mb,tixstream_cpu,tixstream_rss_mb,watchtool_cpu,watchtool_rss_mb"

# Write the header only once so that restarts keep appending to existing data.
if [ ! -f "$OUTPUT_FILE" ]; then
  echo "$HEADER" >"$OUTPUT_FILE"
fi
#######################################
# Print CPU usage and resident memory of the first process whose command line
# matches a pattern.
#
# Arguments: $1 - pattern handed to "pgrep -f" (matched against the full
#                 command line)
# Outputs:   "<cpu>,<rss_mb>", both with two decimals. "0.00,0.00" when no
#            process matches or when the process disappears between the
#            pgrep and the ps call, so the caller always gets two CSV fields.
# NOTE(review): "pgrep -f" also matches unrelated processes that merely have
# the pattern in their arguments, and only the first PID is used — confirm
# that this selects the intended process.
#######################################
get_process_stats() {
  local pattern=$1
  local pid
  local stats=""

  pid=$(pgrep -f "$pattern" | head -n 1)
  if [ -n "$pid" ]; then
    # rss is reported in KiB; convert to MiB.
    stats=$(ps -p "$pid" -o %cpu,rss --no-headers |
      awk 'NR == 1 {printf "%.2f,%.2f", $1, $2 / 1024}')
  fi
  echo "${stats:-0.00,0.00}"
}
#######################################
# Print system-wide CPU usage (user + system, in percent) and used RAM in MiB.
#
# Both tools run with LC_ALL=C so that decimal points and the "Mem:" label do
# not depend on the system locale. The CPU line is split on ":" and "," rather
# than on whitespace, because top prints "%Cpu(s):100.0 us" without a space
# once a value needs all of its column width.
#
# Outputs: "<cpu_percent>,<ram_used_mb>"
# NOTE(review): depending on the top version, a single iteration (-n1) may
# report an average since boot rather than the current load — confirm.
#######################################
get_system_stats() {
  local cpu_sys
  local ram_sys

  # After splitting: $2 = "<n> us", $3 = "<n> sy"; awk uses the leading number.
  cpu_sys=$(LC_ALL=C top -bn1 | awk -F'[:,]' '/Cpu\(s\)/ {print $2 + $3; exit}')
  ram_sys=$(LC_ALL=C free -m | awk '/^Mem:/ {print $3}')
  echo "$cpu_sys,$ram_sys"
}
# Sampling loop: every 5 seconds append one row consisting of the timestamp,
# the system-wide figures and the figures of the two monitored processes.
while :; do
  sample_time=$(date +"%Y-%m-%d %H:%M:%S")
  system_part=$(get_system_stats)
  tjm_part=$(get_process_stats "transfer-job-manager")
  tool_part=$(get_process_stats "watch-tool")
  printf '%s,%s,%s,%s\n' \
    "$sample_time" "$system_part" "$tjm_part" "$tool_part" >>"$OUTPUT_FILE"
  sleep 5
done

View file

@ -0,0 +1,68 @@
#!/bin/bash
# Baseline scenario driver (CLI variant): configuration and ground-truth log
# initialisation.

W1="10.10.2.10"    # Worker 1 (big files)
W2="10.10.3.10"    # Worker 2 (small files)
MY_IP="10.10.1.10" # Worker 0 (this host)

# TIXstream command line client
TIX="/opt/tixel/tixstream/bin/tix"

# Test data sources (on W0)
SRC_LARGE="/data/testfiles/large/5G.dat"
SRC_SMALL="/data/testfiles/small/"

# Flat option string; it is expanded unquoted in the ssh calls so that it
# splits into separate arguments.
# NOTE(review): StrictHostKeyChecking=no disables host key verification —
# acceptable only inside an isolated test network.
SSH_OPTS="-o StrictHostKeyChecking=no -i /home/baUser/.ssh/id_rsa"

SCENARIO_LOG="/var/log/thesis_scenario_ground_truth.csv"

# Write the CSV header only when the log is missing or empty. Appending it on
# every start would put duplicate header rows in the middle of the data.
if [[ ! -s "$SCENARIO_LOG" ]]; then
  echo "timestamp,event_type,description" >>"$SCENARIO_LOG"
fi
#######################################
# Append one ground-truth row "<timestamp>,<event_type>,<description>".
# Globals:   SCENARIO_LOG (read) - CSV file to append to
# Arguments: $1 - event type
#            $2 - free-text description (written as-is, without CSV quoting)
#######################################
log_event() {
  local stamp
  stamp=$(date +"%Y-%m-%d %H:%M:%S")
  printf '%s,%s,%s\n' "$stamp" "$1" "$2" >>"$SCENARIO_LOG"
}
echo "Starting Thesis-Baseline-Scenario..."
while true; do
  # ---------------------------------------------------------
  # 1. Normal load: one large file, W0 -> W1
  # ---------------------------------------------------------
  log_event "NORMAL_LOAD" "Upload Large File to W1"
  "$TIX" submit -s "$SRC_LARGE" -t "tixel://$W1/incoming/" --wait
  sleep 10

  # ---------------------------------------------------------
  # 2. Normal load: directory of small files, W0 -> W2
  # ---------------------------------------------------------
  log_event "NORMAL_LOAD" "Upload Small Files to W2"
  "$TIX" submit -s "$SRC_SMALL" -t "tixel://$W2/incoming/" -r --wait
  sleep 10

  # ---------------------------------------------------------
  # 3. Interference: incoming traffic (W1 -> W0)
  #    The transfer is triggered remotely on W1 via SSH.
  # ---------------------------------------------------------
  log_event "INTERFERENCE" "Incoming Transfer from W1"
  # W1 sends a file back to this host
  # (prerequisite: /data/testfiles/large/1G.dat exists on W1).
  # shellcheck disable=SC2086 # SSH_OPTS is a flat option string and must split
  ssh $SSH_OPTS "baUser@$W1" "$TIX submit -s /data/testfiles/large/1G.dat -t tixel://$MY_IP/incoming/ --wait"
  # Clean up locally (W0 is NOT a blackhole; deletion happens under control).
  rm -rf /data/tixel/incoming/*
  sleep 10

  # ---------------------------------------------------------
  # 4. Stress test: bidirectional (W0 -> W1 AND W2 -> W0)
  #    Simulates a "busy day" (normal, but high load).
  # ---------------------------------------------------------
  log_event "HIGH_LOAD" "Bidirectional Transfer"
  # Upload in the background
  "$TIX" submit -s "$SRC_LARGE" -t "tixel://$W1/incoming/" &
  PID_UP=$!
  # Remote-triggered download in the background
  # shellcheck disable=SC2086 # SSH_OPTS is a flat option string and must split
  ssh $SSH_OPTS "baUser@$W2" "$TIX submit -s /data/testfiles/small/ -t tixel://$MY_IP/incoming/ -r" &
  PID_DOWN=$!
  # Wait for BOTH background commands. Waiting for the upload alone lets the
  # cleanup below run while the download command is still active and leaves
  # one more unreaped background job behind on every iteration.
  # NOTE(review): neither submit uses --wait here, unlike steps 1-3, so the
  # commands may return before the transfers have finished — confirm.
  wait "$PID_UP" "$PID_DOWN"
  # Clean up
  rm -rf /data/tixel/incoming/*
  sleep 30
done

View file

@ -0,0 +1,85 @@
#!/bin/bash
# Baseline scenario orchestrator (REST variant): configuration and
# ground-truth log initialisation.

# Host names of the three worker nodes.
W0="thesis-worker-0"
W1="thesis-worker-1"
W2="thesis-worker-2"
# Port of the transfer-job-manager REST API.
PORT="60000"
# Value sent after "Authorization: Basic", i.e. base64 of "user:password".
# It can be supplied through the TIX_API_AUTH environment variable so that
# real credentials need not live in the repository; the previous hard-coded
# value remains the default.
AUTH="${TIX_API_AUTH:-YWRtaW46dmVyeXNlY3JldA==}"
LOG="/var/log/thesis_scenario_ground_truth.csv"

# Write the CSV header once, when the log file does not exist yet.
# NOTE(review): the CLI variant of the scenario script writes a three-column
# header to the same path — confirm both are never used on the same host.
if [ ! -f "$LOG" ]; then
  echo "timestamp,action,source,target" >"$LOG"
fi
#######################################
# Append one ground-truth row "<timestamp>,<action>,<source>,<target>".
# Globals:   LOG (read) - CSV file to append to
# Arguments: $1 - action, $2 - source, $3 - target (written without quoting)
#######################################
log_gt() {
  local stamp
  stamp=$(date +"%Y-%m-%d %H:%M:%S")
  printf '%s,%s,%s,%s\n' "$stamp" "$1" "$2" "$3" >>"$LOG"
}
#######################################
# Block until the transfer-job-manager API on a host answers with HTTP 200.
# Polls every 10 seconds without an upper limit.
#
# Globals:   PORT (read), AUTH (read)
# Arguments: $1 - host name or address
# Outputs:   progress messages on stdout
#######################################
wait_for_api() {
  local host=$1
  local http_code

  echo "Prüfe Verfügbarkeit von TIXstream auf $host..."
  while true; do
    # The timeouts keep a host that accepts the connection but never answers
    # from stalling the loop forever. On failure curl reports status 000.
    http_code=$(curl --write-out "%{http_code}" --silent --output /dev/null \
      --connect-timeout 5 --max-time 15 \
      --url "http://$host:$PORT/transfer-job-manager/v1/jobs?limit=1" \
      --header "Authorization: Basic $AUTH")
    # String comparison: an empty result (e.g. curl not runnable) is simply
    # "not ready" instead of a test-command error.
    if [[ "$http_code" == "200" ]]; then
      echo "API auf $host ist ONLINE (Code 200)."
      break
    fi
    echo "Warte auf API ($host)... Status: $http_code. Retry in 10s."
    sleep 10
  done
}
#######################################
# Submit one transfer job to the transfer-job-manager REST API of a host.
#
# Globals:   PORT (read), AUTH (read)
# Arguments: $1 - host whose API receives the job
#            $2 - destination system name
#            $3 - source file URI
#            $4 - job description
# Outputs:   nothing on stdout; a diagnostic on stderr when submission fails
# Returns:   0 on success, 1 when the payload cannot be built or the request
#            fails (transport error or HTTP status >= 400)
#######################################
trigger_job() {
  local host=$1
  local dest_sys=$2
  local file_uri=$3
  local desc=$4
  local payload

  # jq builds the JSON so that every value is escaped correctly.
  if ! payload=$(jq -n \
    --arg desc "$desc" \
    --arg dest_sys "$dest_sys" \
    --arg file "$file_uri" \
    '{
"description": $desc,
"schedule": "NOW",
"destination_system": $dest_sys,
"destination_share": "local_sync_destination",
"source_file_uris": [$file],
"wait_for_publish": false,
"flat_file_mode_disabled": false
}'); then
    echo "trigger_job: could not build JSON payload for '$desc'" >&2
    return 1
  fi

  # --fail turns HTTP error statuses into a non-zero exit code; without it a
  # rejected job is indistinguishable from an accepted one.
  if ! curl --request POST \
    --url "http://$host:$PORT/transfer-job-manager/v1/jobs" \
    --header "Authorization: Basic $AUTH" \
    --header 'Content-Type: application/json' \
    --data "$payload" \
    --connect-timeout 5 --max-time 60 \
    --silent --show-error --fail >/dev/null; then
    echo "trigger_job: submitting '$desc' to $host failed" >&2
    return 1
  fi
}
# Do not start before the API of every worker answers.
for node in "$W0" "$W1" "$W2"; do
  wait_for_api "$node"
done

echo "Alle Systeme bereit. Starte Baseline-Szenario Loop..."
log_gt "SYSTEM_READY" "ALL" "Orchestrator started"

# Local destination directory that is emptied at the end of every round.
INCOMING_DIR="/local_testdata/test-destination"

# One round = large transfer, ten small transfers, one interfering transfer
# in the opposite direction. Jobs are only submitted; the fixed pauses are
# what separates the phases.
while :; do
  # Phase 1: one large file, W0 -> W1
  log_gt "START_LARGE" "W0" "W1"
  trigger_job "$W0" "Worker Node 1" "local_sync_source/5g.mxf" "Thesis Large File"
  sleep 60

  # Phase 2: ten small files, W0 -> W2
  log_gt "START_SMALL" "W0" "W2"
  for ((i = 1; i <= 10; i++)); do
    trigger_job "$W0" "Worker Node 2" "local_sync_source/small_$i.dat" "Thesis Small File $i"
  done
  sleep 30

  # Phase 3: interference, W1 -> W0
  log_gt "START_INTERFERENCE" "W1" "W0"
  trigger_job "$W1" "Worker Node 0" "local_sync_source/5g.mxf" "Interference Load"
  sleep 60

  # ":?" aborts instead of expanding to "/*" should the variable ever be empty.
  rm -rf "${INCOMING_DIR:?}/"*
done

View file

@ -0,0 +1,13 @@
# systemd unit that runs the watch-tool binary as a long-lived service.
[Unit]
Description=Thesis Watch Tool - Metrics Collector
# Ordered after network.target.
After=network.target
[Service]
# NOTE(review): the service runs with full root privileges — confirm that the
# collector actually needs them.
User=root
# Relative paths used by the process resolve against this directory.
WorkingDirectory=/opt/watch-tool
ExecStart=/opt/watch-tool/watch-tool
# Restart the process whenever it exits, after a 5 second pause.
Restart=always
RestartSec=5
[Install]
# Pulled in by multi-user.target once the unit is enabled.
WantedBy=multi-user.target