add Final Infrastructure Setup
This commit is contained in:
commit
7733dde658
174 changed files with 204949 additions and 0 deletions
|
|
@ -0,0 +1,6 @@
|
|||
---
|
||||
# Handler: reload systemd's unit definitions and restart the watch-tool service.
- name: Restart Watch-Tool
  systemd:
    name: watch-tool
    state: restarted
    # Canonical boolean instead of the YAML 1.1 truthy `yes`.
    daemon_reload: true
|
||||
109
infrastructure/ansible/roles/monitoring_baseline/tasks/main.yml
Normal file
109
infrastructure/ansible/roles/monitoring_baseline/tasks/main.yml
Normal file
|
|
@ -0,0 +1,109 @@
|
|||
---
|
||||
# Render the journal dumper script from its Jinja2 template and install it
# as an executable under /usr/local/bin.
- name: Kopiere Journal Dumper Script
  template:
    dest: /usr/local/bin/journal_dumper.sh
    src: journal_dumper.sh.j2
    mode: "0755"
|
||||
|
||||
# Write one systemd unit per looped item; each unit runs
# /usr/local/bin/<item>.sh as root and is restarted whenever it exits.
- name: Deploye Systemd Services für Monitoring
  copy:
    dest: "/etc/systemd/system/{{ item }}.service"
    # Explicit permissions so the result does not depend on the remote umask.
    mode: "0644"
    content: |
      [Unit]
      Description=Thesis {{ item }} Service
      After=network.target

      [Service]
      Type=simple
      ExecStart=/usr/local/bin/{{ item }}.sh
      Restart=always
      User=root

      [Install]
      WantedBy=multi-user.target
  loop:
    - journal_dumper
|
||||
|
||||
# Reload unit definitions, then start each monitoring service and enable it
# at boot.
- name: Starte und aktiviere Monitoring Services
  systemd:
    name: "{{ item }}"
    state: started
    # Canonical booleans instead of the YAML 1.1 truthy `yes`.
    enabled: true
    daemon_reload: true
  loop:
    - journal_dumper
|
||||
|
||||
# Create a scratch directory on the target host; its path is exposed to the
# following tasks as `tmpdir.path`.
- name: Erstelle Temp-Verzeichnis
  register: tmpdir
  tempfile:
    state: directory
|
||||
|
||||
# Fetch the metrics-collector release archive into the scratch directory.
- name: Download tar.gz
  get_url:
    url: "https://codeberg.org/Pata1704/metrics-collector/releases/download/v1.1.0/metrics-collector_Linux_x86_64.tar.gz"
    dest: "{{ tmpdir.path }}/metrics-collector.tar.gz"
  # `retries`/`delay` only take effect together with `until` on older Ansible
  # releases; without it the task was attempted exactly once.
  register: download_result
  until: download_result is succeeded
  retries: 3
  delay: 5
|
||||
|
||||
# Unpack the downloaded archive in place on the target host.
- name: Entpacke Binary
  unarchive:
    src: "{{ tmpdir.path }}/metrics-collector.tar.gz"
    dest: "{{ tmpdir.path }}"
    # The archive already lives on the remote host; canonical boolean
    # instead of the YAML 1.1 truthy `yes`.
    remote_src: true
|
||||
|
||||
# Copy the extracted binary from the scratch directory to /usr/local/bin and
# make it executable.
- name: Installiere Binary
  copy:
    src: "{{ tmpdir.path }}/metrics-collector"
    dest: /usr/local/bin/metrics-collector
    mode: "0755"
    # Source path is on the remote host; canonical boolean instead of `yes`.
    remote_src: true
|
||||
|
||||
# Remove the scratch directory created for the download.
- name: Cleanup Temp
  file:
    state: absent
    path: "{{ tmpdir.path }}"
|
||||
|
||||
# Install the systemd unit for the metrics collector. The registered result
# (`service_file`) lets a later task reload systemd only when the unit changed.
- name: Deploy systemd Service
  copy:
    dest: /etc/systemd/system/metrics-collector.service
    mode: "0644"
    content: |
      [Unit]
      Description=Thesis Metrics Collector
      # After= only orders against network-online.target; Wants= is what
      # actually pulls that target into the boot transaction.
      Wants=network-online.target
      After=network-online.target

      [Service]
      Type=simple
      ExecStart=/usr/local/bin/metrics-collector
      Restart=always
      RestartSec=10
      TimeoutStopSec=60
      KillMode=mixed
      KillSignal=SIGTERM
      StandardOutput=journal
      StandardError=journal

      [Install]
      WantedBy=multi-user.target
  register: service_file
|
||||
|
||||
# Re-read unit files, but only when the metrics-collector unit was modified.
- name: Reload systemd
  systemd:
    # Canonical boolean instead of the YAML 1.1 truthy `yes`.
    daemon_reload: true
  when: service_file.changed
|
||||
|
||||
# Start the metrics collector now and enable it at boot.
- name: Enable und Start Service
  systemd:
    name: metrics-collector
    state: started
    # Canonical boolean instead of the YAML 1.1 truthy `yes`.
    enabled: true
|
||||
|
||||
# Read-only verification: the task never reports "changed" and fails unless
# systemctl prints exactly "active" for the unit.
- name: Prüfe Service-Status
  command: systemctl is-active metrics-collector
  changed_when: false
  register: service_check
  failed_when: service_check.stdout != "active"
|
||||
|
||||
# Print a confirmation message at the end of the role.
- name: Zeige Erfolg
  debug:
    msg: "Metrics Collector erfolgreich deployed!"
|
||||
|
|
@ -0,0 +1,9 @@
|
|||
#!/bin/bash
# Blackhole loop: every 10 seconds, delete regular files below the target
# directory whose modification time is more than 0.5 minutes in the past.
TARGET="/local_testdata/test-destination"

echo "Starte Blackhole auf $TARGET"

while :; do
    find "$TARGET" -type f -mmin +0.5 -delete
    sleep 10
done
|
||||
|
|
@ -0,0 +1,111 @@
|
|||
export:
|
||||
enabled: true
|
||||
batch_size: 100
|
||||
export_interval: "30s"
|
||||
retry_attempts: 5
|
||||
retry_backoff: "10s"
|
||||
health_check_interval: "60s"
|
||||
|
||||
localstorage:
|
||||
enabled: true
|
||||
db_path: "./watch.db"
|
||||
rotation:
|
||||
max_sizes_bytes: 104857600 # 100 MiB; YAML does not evaluate arithmetic, "100 * 1024 * 1024" would load as a string
|
||||
max_age_hours: 24
|
||||
max_files: 3
|
||||
check_interval_minuntes: 5 # NOTE(review): key is spelled "minuntes" - confirm it matches the name the consumer reads, otherwise this setting may be ignored
|
||||
archive_dir: ""
|
||||
|
||||
elasticsearch:
|
||||
enabled: false
|
||||
url: "http://10.0.0.99:9200"
|
||||
index: "watch"
|
||||
username: "your-configured-user"
|
||||
password: "your-super-secret-password"
|
||||
api_key: "your-api-key"
|
||||
timeout: 30
|
||||
|
||||
web_service:
|
||||
enabled: true
|
||||
host: "0.0.0.0"
|
||||
port: 9090
|
||||
|
||||
system_metrics:
|
||||
enabled: true
|
||||
collect_cpu: true
|
||||
collect_memory: true
|
||||
collect_disk: true
|
||||
collect_network: true
|
||||
disk_paths:
|
||||
- "/"
|
||||
- "/var"
|
||||
- "/home"
|
||||
network_interfaces:
|
||||
- "ens6"
|
||||
collect_network_connections: true
|
||||
collect_load_average: true
|
||||
collect_tcp_stats: true
|
||||
collect_filehandles: true
|
||||
collect_disk_io: true
|
||||
collect_network_latency: true
|
||||
collect_bandwidth_usage: true
|
||||
transfer_ports: 60003
|
||||
latency_test_hosts: "www.google.de"
|
||||
|
||||
poll_interval_seconds: 30
|
||||
patterns_file: "./configs/patterns.yaml"
|
||||
|
||||
logging:
|
||||
level: "info"
|
||||
file_path: "/var/log/system-monitor.log"
|
||||
|
||||
drain3:
|
||||
enabled: true
|
||||
state_dir: "./drain3_states"
|
||||
depth: 4
|
||||
sim_th: 0.4
|
||||
max_children: 100
|
||||
max_clusters: 1000
|
||||
save_interval: 60
|
||||
|
||||
services:
|
||||
- name: "nginx"
|
||||
service: "nginx.service"
|
||||
enabled: true
|
||||
since_time: ""
|
||||
priority: "info"
|
||||
|
||||
tools:
|
||||
- name: "nginx-access"
|
||||
log_file: "/var/log/nginx/access.log"
|
||||
enabled: true
|
||||
buffer_size: 200
|
||||
format:
|
||||
name: "nginx_combined"
|
||||
pattern: '^(?P<client_ip>\S+) \S+ \S+ \[(?P<timestamp>[^\]]+)\] "(?P<method>\S+) (?P<path>\S+) (?P<protocol>\S+)" (?P<status>\d+) (?P<body_bytes>\d+) "(?P<referer>[^"]*)" "(?P<user_agent>[^"]*)"'
|
||||
fields:
|
||||
client_ip: "remote_addr"
|
||||
timestamp: "time_local"
|
||||
method: "request_method"
|
||||
path: "request_uri"
|
||||
protocol: "server_protocol"
|
||||
status: "status"
|
||||
body_bytes: "body_bytes_sent"
|
||||
referer: "http_referer"
|
||||
user_agent: "http_user_agent"
|
||||
|
||||
- name: "nginx-error"
|
||||
log_file: "/var/log/nginx/error.log"
|
||||
enabled: true
|
||||
buffer_size: 100
|
||||
format:
|
||||
name: "nginx_error"
|
||||
pattern: '^(?P<timestamp>\d{4}/\d{2}/\d{2} \d{2}:\d{2}:\d{2}) \[(?P<level>\w+)\] (?P<pid>\d+)#(?P<tid>\d+): (?P<message>.*)'
|
||||
fields:
|
||||
timestamp: "time"
|
||||
level: "log_level"
|
||||
pid: "process_id"
|
||||
tid: "thread_id"
|
||||
message: "error_message"
|
||||
|
||||
|
||||
|
|
@ -0,0 +1,15 @@
|
|||
#!/bin/bash
# Blackhole service for the directory into which TIXstream drops received
# files (adjust the path to the local installation).
INCOMING_DIR="/data/tixel/incoming"

# Minimum age, in minutes since the last modification, before a file is
# deleted. 0.1 min = 6 s. The check uses the modification time (-mmin), not
# the access time, so files that are still being written are left alone.
MIN_AGE_MINUTES="0.1"

echo "Starte Blackhole-Service für $INCOMING_DIR..."

while true; do
    find "$INCOMING_DIR" -type f -mmin "+$MIN_AGE_MINUTES" -delete

    sleep 10
done
|
||||
|
|
@ -0,0 +1,11 @@
|
|||
#!/bin/bash
# Streams the journal of the relevant services (TIXstream engine and the
# transfer job manager) into a log file.
#
# journalctl flags used below:
#   -u          restrict output to the given unit (may be repeated)
#   -f          follow the journal live
#   -o json     emit structured JSON entries
#   --no-pager  never start a pager (the script runs in the background)

log_path="/var/log/thesis_tixstream_logs.json"

# Build the unit filter from a list so further units are a one-word change.
unit_filter=()
for unit in tixstream transfer-job-manager; do
    unit_filter+=(-u "$unit")
done

# exec replaces this shell, so the service's main process is journalctl itself.
exec journalctl "${unit_filter[@]}" -f -o json --no-pager >>"$log_path"
|
||||
|
|
@ -0,0 +1,165 @@
|
|||
patterns:
|
||||
# ===========================================================================
|
||||
# Common / Shared Patterns
|
||||
# ===========================================================================
|
||||
common:
|
||||
extractors:
|
||||
- name: "syslog_header"
  # The timestamp group is named so that it lines up with the
  # `syslog_timestamp` entry under `fields`; every other extractor in this
  # file pairs each field key with a named group of the same name.
  # NOTE(review): classic syslog pads single-digit days with a space
  # ("Jan  2"), which `\d{2}` does not match - confirm the input format.
  regex: '^(?P<syslog_timestamp>\w{3} \d{2} \d{2}:\d{2}:\d{2}) (?P<hostname>[^\s]+) (?P<process_info>[^:]+):\s*(?P<message_rest>.*)$'
  fields:
    syslog_timestamp: "time:Jan 02 15:04:05"
    hostname: "string"
    process_info: "string"
    message_rest: "string"
|
||||
|
||||
- name: "timestamp_rfc3339"
|
||||
regex: '(?P<timestamp>\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}(?:\.\d+)?Z?)'
|
||||
fields:
|
||||
timestamp: "time:2006-01-02T15:04:05.000000Z"
|
||||
|
||||
# ===========================================================================
|
||||
# TIXstream Service
|
||||
# Deckt ab: tsServicePattern, tsTransferIDPattern, tsDetailPattern1-4
|
||||
# ===========================================================================
|
||||
tixstream:
|
||||
extractors:
|
||||
- name: "service_log_base"
|
||||
regex: '^(?P<log_level>\S+)\s+(?P<timestamp>\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2}\.\d{6})\s+(?P<message>.*)'
|
||||
fields:
|
||||
log_level: "string"
|
||||
timestamp: "time:2006-01-02 15:04:05.000000"
|
||||
message: "string"
|
||||
|
||||
- name: "transfer_id_extraction"
|
||||
regex: '^(?P<transfer_id>\w{8}-\w{4}-\w{4}-\w{4}-\w{12})\s+(?P<message>.*)'
|
||||
fields:
|
||||
transfer_id: "string"
|
||||
message: "string"
|
||||
|
||||
- name: "transfer_start_in"
|
||||
regex: 'in: Transfer start (?P<thread_info>\d+/\d+) buffers=(?P<buffers>\d+) files=(?P<file_count>\d+) size=(?P<size_mb>[0-9.]+) MByte chunksize=(?P<chunk_size>\d+) streams=(?P<streams>\d+) target-datarate=(?P<target_rate>[0-9.]+) MByte/s protocol=(?P<protocol>\w+) dest=(?P<destination>\S+) sender-id=(?P<sender_id>\S+)'
|
||||
fields:
|
||||
thread_info: "string" # z.B. "1/4" - Typisierung hier schwierig, also String
|
||||
buffers: "int"
|
||||
file_count: "int"
|
||||
size_mb: "float"
|
||||
chunk_size: "int"
|
||||
streams: "int"
|
||||
target_rate: "float"
|
||||
protocol: "string"
|
||||
destination: "string"
|
||||
sender_id: "string"
|
||||
direction: "string" # Wir können statische Felder im Parser injecten oder hier als "implizit" betrachten
|
||||
|
||||
- name: "transfer_start_remote_out"
|
||||
regex: 'out: Start remote transfer to (?P<target>[^\s]+) request executed, duration=(?P<duration>[0-9.]+) s'
|
||||
fields:
|
||||
target: "string"
|
||||
duration: "float"
|
||||
|
||||
- name: "transfer_start_out"
|
||||
regex: 'out: Transfer start (?P<thread_info>\d+/\d+) buffers=(?P<buffers>\d+) files=(?P<file_count>\d+) size=(?P<size_mb>[0-9.]+) MByte chunksize=(?P<chunk_size>\d+) streams=(?P<streams>\d+) target-datarate=(?P<target_rate>[0-9.]+) MByte/s protocol=(?P<protocol>\w+) src=(?P<source>\S+) receiver=(?P<receiver>\S+)'
|
||||
fields:
|
||||
thread_info: "string"
|
||||
buffers: "int"
|
||||
file_count: "int"
|
||||
size_mb: "float"
|
||||
chunk_size: "int"
|
||||
streams: "int"
|
||||
target_rate: "float"
|
||||
protocol: "string"
|
||||
source: "string"
|
||||
receiver: "string"
|
||||
|
||||
- name: "transfer_start_generic"
|
||||
regex: 'out: Start transfer (?P<thread_info>\d+/\d+), src=(?P<source>[^ ]*) dest=(?P<destination>[^ ]*) item\[0\]=(?P<item0>[^ ]*) count=(?P<count>\d+)'
|
||||
fields:
|
||||
thread_info: "string"
|
||||
source: "string"
|
||||
destination: "string"
|
||||
item0: "string"
|
||||
count: "int"
|
||||
|
||||
# ===========================================================================
|
||||
# Transfer Job Manager (TJM)
|
||||
# Deckt ab: tjmServicePattern, tjmTransferNamePattern, tjmTransferIDPattern1/2
|
||||
# ===========================================================================
|
||||
transfer-job-manager:
|
||||
extractors:
|
||||
- name: "service_log_base"
|
||||
regex: '^(?P<timestamp>\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2}\.\d{3})\s+(?P<log_level>\S+)\s+(?P<pid>\d+).*?\[(?P<correlation_id>[^\]]*)\]\s+\[(?P<username>[^\]]*)\]\s+\[(?P<thread_id>[^\]]*)\]\s+(?P<java_class>.*?)\s+:\s+(?P<message>.*)'
|
||||
fields:
|
||||
timestamp: "time:2006-01-02 15:04:05.000"
|
||||
log_level: "string"
|
||||
pid: "int"
|
||||
correlation_id: "string"
|
||||
username: "string"
|
||||
thread_id: "string"
|
||||
java_class: "string"
|
||||
message: "string"
|
||||
|
||||
- name: "transfer_name_info"
|
||||
regex: '^(?P<transfer_name_raw>\d{8}T\d{6}-[A-Za-z0-9]+-.+?-(?:in|out)) ?: (?P<message>.*)$'
|
||||
fields:
|
||||
transfer_name_raw: "string"
|
||||
message: "string"
|
||||
|
||||
- name: "transfer_id_mid"
|
||||
regex: '(?P<transfer_id>\w{8}-\w{4}-\w{4}-\w{4}-\w{12}).*?(?P<message>.*)'
|
||||
fields:
|
||||
transfer_id: "string"
|
||||
message: "string"
|
||||
|
||||
- name: "transfer_id_prefixed"
|
||||
regex: '(?P<prefix>.*)(?P<transfer_id>\w{8}-\w{4}-\w{4}-\w{4}-\w{12}).*?(?P<message>.*)'
|
||||
fields:
|
||||
prefix: "string"
|
||||
transfer_id: "string"
|
||||
message: "string"
|
||||
|
||||
# ===========================================================================
|
||||
# Access Manager & TCC
|
||||
# Deckt ab: amServicePattern, tccServicePattern
|
||||
# ===========================================================================
|
||||
access-manager:
|
||||
extractors:
|
||||
- name: "spring_boot_log"
|
||||
regex: '^(?P<timestamp>\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}(?:\.\d+)?Z)\s+(?P<log_level>\w+)\s+(?P<pid>\d+)\s+---\s+\[\s*(?P<thread_id>[^\]]*)\]\s+(?P<logger>[\w\.]+)\s*:\s+(?P<message>.*)$'
|
||||
fields:
|
||||
timestamp: "time:2006-01-02T15:04:05.000000Z"
|
||||
log_level: "string"
|
||||
pid: "int"
|
||||
thread_id: "string"
|
||||
logger: "string"
|
||||
message: "string"
|
||||
|
||||
tixel-control-center:
|
||||
extractors:
|
||||
- name: "spring_boot_log"
|
||||
regex: '^(?P<timestamp>\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}(?:\.\d+)?Z)\s+(?P<log_level>\w+)\s+(?P<pid>\d+)\s+---\s+\[\s*(?P<thread_id>[^\]]*)\]\s+(?P<logger>[\w\.]+)\s*:\s+(?P<message>.*)$'
|
||||
fields:
|
||||
timestamp: "time:2006-01-02T15:04:05.000000Z"
|
||||
log_level: "string"
|
||||
pid: "int"
|
||||
thread_id: "string"
|
||||
logger: "string"
|
||||
message: "string"
|
||||
|
||||
# ===========================================================================
|
||||
# Nginx
|
||||
# Deckt ab: nginxAccessPattern
|
||||
# ===========================================================================
|
||||
nginx:
|
||||
extractors:
|
||||
- name: "access_log"
|
||||
regex: '^(?P<client_ip>\S+)\s+\S+\s+(?P<remote_user>\S+)\s+\[(?P<timestamp_nginx>[^\]]+)\]\s+"(?P<request>[^"]+)"\s+(?P<status_code>\d+)\s+(?P<bytes_sent>\d+|-)\s*(?:"(?P<referer>[^"]*)"\s+"(?P<user_agent>[^"]*)")?'
|
||||
fields:
|
||||
client_ip: "string"
|
||||
remote_user: "string"
|
||||
timestamp_nginx: "string"
|
||||
request: "string"
|
||||
status_code: "int"
|
||||
bytes_sent: "int"
|
||||
referer: "string"
|
||||
user_agent: "string"
|
||||
|
||||
|
|
@ -0,0 +1,57 @@
|
|||
#!/bin/bash
|
||||
|
||||
# Destination CSV, the network interface whose byte counters are sampled, and
# the host that is pinged for the round-trip time column.
OUTPUT_FILE="/var/log/thesis_training_metrics.csv"
IFACE="ens4"
PING_TARGET="10.10.2.10"

# Start a fresh CSV on every launch (the file is truncated, not appended to).
printf '%s\n' "timestamp,cpu_user,cpu_sys,cpu_wait,ram_used_mb,disk_read_iops,disk_write_iops,net_rx_kb_s,net_tx_kb_s,rtt_ms" >"$OUTPUT_FILE"
|
||||
|
||||
# Print "<reads> <writes>": the cumulative completed read and write operations
# summed over all whole disks (sdX, vdX, nvmeXnY).
#
# $1 - optional path of the statistics file (default: /proc/diskstats)
#
# The device name is the third column, so the pattern is matched against $3.
# The previous pattern was anchored to the end of the whole line, which ends
# in a counter, so no line ever matched and both sums were always 0.
# Partitions (sda1, nvme0n1p1, ...) are excluded to avoid double counting.
get_disk_iops() {
    local stats_file="${1:-/proc/diskstats}"

    awk '$3 ~ /^(sd[a-z]+|vd[a-z]+|nvme[0-9]+n[0-9]+)$/ {r += $4; w += $8}
         END {print r + 0, w + 0}' "$stats_file"
}
|
||||
|
||||
# Print one statistics counter of the interface named by $IFACE.
#
# $1 - counter file name below /sys/class/net/<iface>/statistics
#      (the script uses rx_bytes and tx_bytes)
get_net_bytes() {
    local counter=$1

    cat "/sys/class/net/$IFACE/statistics/$counter"
}
|
||||
|
||||
# Take the baseline sample against which the first loop iteration computes
# its rates: network byte counters, disk operation counters and a nanosecond
# timestamp.
OLD_RX=$(get_net_bytes rx_bytes)
OLD_TX=$(get_net_bytes tx_bytes)
read OLD_DR OLD_DW <<<$(get_disk_iops)
OLD_TS=$(date +%s%N)
|
||||
|
||||
# Sampling loop: appends one CSV row per iteration with CPU shares, used RAM,
# disk operation rates, network throughput and ping round-trip time.
while true; do
    # `vmstat 1 2` prints two reports; only the last line (the one-second
    # sample) is used. Columns 13/14/16 feed the cpu_user, cpu_sys and
    # cpu_wait CSV fields.
    VMSTAT=$(vmstat 1 2 | tail -1)
    CPU_US=$(echo "$VMSTAT" | awk '{print $13}')
    CPU_SY=$(echo "$VMSTAT" | awk '{print $14}')
    CPU_WA=$(echo "$VMSTAT" | awk '{print $16}')

    RAM_USED=$(free -m | awk '/Mem:/ {print $3}')

    # Summary line: "rtt min/avg/max/mdev = 0.045/0.052/0.060/0.005 ms".
    # Split on "/", field 4 is "mdev = <min>" (non-numeric, formatted as 0.00)
    # and field 5 is the average, which is the value wanted here.
    RTT=$(ping -c 1 -W 1 -q "$PING_TARGET" | awk -F'/' '/rtt/ {printf "%.2f", $5}')
    # No summary line (host unreachable / timeout): record 0.0.
    [ -z "$RTT" ] && RTT="0.0"

    NEW_TS=$(date +%s%N)
    NEW_RX=$(get_net_bytes rx_bytes)
    NEW_TX=$(get_net_bytes tx_bytes)
    read -r NEW_DR NEW_DW <<<"$(get_disk_iops)"

    # Elapsed time since the previous sample, in seconds.
    DT=$(echo "scale=3; ($NEW_TS - $OLD_TS) / 1000000000" | bc)

    # Byte counter deltas -> KiB per second.
    RX_RATE=$(echo "scale=2; ($NEW_RX - $OLD_RX) / 1024 / $DT" | bc)
    TX_RATE=$(echo "scale=2; ($NEW_TX - $OLD_TX) / 1024 / $DT" | bc)

    # Completed-operation deltas -> operations per second.
    READ_IOPS=$(echo "scale=2; ($NEW_DR - $OLD_DR) / $DT" | bc)
    WRITE_IOPS=$(echo "scale=2; ($NEW_DW - $OLD_DW) / $DT" | bc)

    TIMESTAMP=$(date +"%Y-%m-%d %H:%M:%S")

    echo "$TIMESTAMP,$CPU_US,$CPU_SY,$CPU_WA,$RAM_USED,$READ_IOPS,$WRITE_IOPS,$RX_RATE,$TX_RATE,$RTT" >>"$OUTPUT_FILE"

    # The current sample becomes the baseline for the next iteration.
    OLD_RX=$NEW_RX
    OLD_TX=$NEW_TX
    OLD_DR=$NEW_DR
    OLD_DW=$NEW_DW
    OLD_TS=$NEW_TS

    sleep 5
done
|
||||
|
|
@ -0,0 +1,63 @@
|
|||
#!/bin/bash
|
||||
#
|
||||
# OUTPUT_FILE="/var/log/thesis_resource_baseline.csv"
|
||||
#
|
||||
# if [ ! -f "$OUTPUT_FILE" ]; then
|
||||
# echo "timestamp,cpu_percent,ram_kb,command" > "$OUTPUT_FILE"
|
||||
# fi
|
||||
#
|
||||
# while true; do
|
||||
# PID=$(pgrep -x "watch-tool")
|
||||
#
|
||||
# if [ ! -z "$PID" ]; then
|
||||
# TIMESTAMP=$(date +"%Y-%m-%d %H:%M:%S")
|
||||
#
|
||||
# STATS=$(pidstat -p $PID -u -r 1 1 | tail -1)
|
||||
#
|
||||
# eval $(ps -p $PID -o %cpu,rss --no-headers | awk '{print "CPU="$1; print "RSS_KB="$2}')
|
||||
#
|
||||
# echo "$TIMESTAMP,$CPU,$RSS_KB,watch-tool" >> "$OUTPUT_FILE"
|
||||
# fi
|
||||
#
|
||||
# sleep 10
|
||||
# done
|
||||
|
||||
# CSV destination and its column header. The header is written only when the
# file does not exist yet, so restarts keep appending to the same file.
OUTPUT_FILE="/var/log/thesis_resource_usage.csv"
HEADER="timestamp,system_cpu_usage,system_ram_used_mb,tixstream_cpu,tixstream_rss_mb,watchtool_cpu,watchtool_rss_mb"

[ -f "$OUTPUT_FILE" ] || echo "$HEADER" >"$OUTPUT_FILE"
|
||||
|
||||
# Print "<cpu_percent>,<rss_mb>" for the first process whose full command line
# matches the given pattern, or "0.00,0.00" when there is none.
#
# $1 - pattern handed to `pgrep -f`
#
# The fallback also covers a process that exits between the pgrep and ps
# calls; previously that produced an empty string and a CSV row with missing
# columns.
get_process_stats() {
    local pattern=$1
    local pid stats=""

    pid=$(pgrep -f "$pattern" | head -1)

    if [ -n "$pid" ]; then
        # ps reports RSS in KiB; convert to MiB.
        stats=$(ps -p "$pid" -o %cpu,rss --no-headers | awk '{printf "%.2f,%.2f", $1, $2/1024}')
    fi

    echo "${stats:-0.00,0.00}"
}
|
||||
|
||||
# Print "<cpu>,<ram_mb>" for the whole system: the sum of the 2nd and 4th
# fields of top's "Cpu(s)" summary line, and the "used" column of `free -m`.
get_system_stats() {
    local cpu_busy mem_used_mb

    cpu_busy=$(top -bn1 | grep "Cpu(s)" | awk '{print $2 + $4}')
    mem_used_mb=$(free -m | awk '/Mem:/ {print $3}')

    printf '%s,%s\n' "$cpu_busy" "$mem_used_mb"
}
|
||||
|
||||
# Every 5 seconds append one row: timestamp, system-wide stats, then the
# per-process stats for "transfer-job-manager" and "watch-tool".
# NOTE(review): the CSV header labels the third group "tixstream_*" while the
# pattern sampled is "transfer-job-manager" - confirm this is intended.
while :; do
    now=$(date +"%Y-%m-%d %H:%M:%S")
    system_part=$(get_system_stats)
    tix_part=$(get_process_stats "transfer-job-manager")
    tool_part=$(get_process_stats "watch-tool")

    printf '%s,%s,%s,%s\n' "$now" "$system_part" "$tix_part" "$tool_part" >>"$OUTPUT_FILE"

    sleep 5
done
|
||||
|
|
@ -0,0 +1,68 @@
|
|||
#!/bin/bash
|
||||
|
||||
W1="10.10.2.10"    # Worker 1 (Big Files)
W2="10.10.3.10"    # Worker 2 (Small Files)
MY_IP="10.10.1.10" # Worker 0 (Myself)

# TIXstream CLI
TIX="/opt/tixel/tixstream/bin/tix"
# Test data sources (located on worker 0)
SRC_LARGE="/data/testfiles/large/5G.dat"
SRC_SMALL="/data/testfiles/small/"

# NOTE(review): StrictHostKeyChecking=no disables host key verification;
# acceptable only inside an isolated test network.
SSH_OPTS="-o StrictHostKeyChecking=no -i /home/baUser/.ssh/id_rsa"

SCENARIO_LOG="/var/log/thesis_scenario_ground_truth.csv"
# Write the CSV header only when the log is missing or empty; appending it
# unconditionally added a duplicate header row on every start of the script.
[ -s "$SCENARIO_LOG" ] || echo "timestamp,event_type,description" >>"$SCENARIO_LOG"
|
||||
|
||||
# Append one ground-truth row "<timestamp>,<event_type>,<description>" to
# $SCENARIO_LOG.
#
# $1 - event type
# $2 - free-text description
log_event() {
    local event_type=$1
    local description=$2

    printf '%s,%s,%s\n' "$(date +"%Y-%m-%d %H:%M:%S")" "$event_type" "$description" >>"$SCENARIO_LOG"
}
|
||||
|
||||
echo "Starting Thesis-Baseline-Scenario..."
|
||||
|
||||
# Endless baseline scenario. Each phase is recorded in the ground-truth log
# before it starts.
while true; do
    # 1. Normal load: large file to worker 1.
    log_event "NORMAL_LOAD" "Upload Large File to W1"
    $TIX submit -s "$SRC_LARGE" -t "tixel://$W1/incoming/" --wait

    sleep 10

    # 2. Normal load: small files to worker 2.
    log_event "NORMAL_LOAD" "Upload Small Files to W2"
    $TIX submit -s "$SRC_SMALL" -t "tixel://$W2/incoming/" -r --wait

    sleep 10

    # ---------------------------------------------------------
    # 3. Interference: incoming traffic (W1 -> W0).
    #    The transfer is triggered remotely on W1 via SSH.
    #    Precondition: /data/testfiles/large/1G.dat exists on W1.
    # ---------------------------------------------------------
    log_event "INTERFERENCE" "Incoming Transfer from W1"
    # $SSH_OPTS is intentionally unquoted so it splits into separate options.
    ssh $SSH_OPTS baUser@$W1 "$TIX submit -s /data/testfiles/large/1G.dat -t tixel://$MY_IP/incoming/ --wait"

    # Local cleanup (W0 is not a blackhole; received files are removed here).
    rm -rf /data/tixel/incoming/*

    sleep 10

    # ---------------------------------------------------------
    # 4. Stress test: bidirectional (W0 -> W1 and W2 -> W0).
    #    Simulates a busy day: normal behaviour, but high load.
    # ---------------------------------------------------------
    log_event "HIGH_LOAD" "Bidirectional Transfer"
    # Upload in the background.
    $TIX submit -s "$SRC_LARGE" -t "tixel://$W1/incoming/" &
    PID_UP=$!

    # Remotely triggered download in the background.
    ssh $SSH_OPTS baUser@$W2 "$TIX submit -s /data/testfiles/small/ -t tixel://$MY_IP/incoming/ -r" &
    PID_DOWN=$!

    # Wait for both background jobs. Previously only the upload was awaited,
    # so the cleanup below could run while the ssh-triggered job was active.
    # NOTE(review): neither command here passes --wait, unlike phases 1-3;
    # confirm whether `submit` blocks until the transfer itself has finished.
    wait "$PID_UP" "$PID_DOWN"

    # Cleanup.
    rm -rf /data/tixel/incoming/*

    sleep 30
done
|
||||
|
|
@ -0,0 +1,85 @@
|
|||
#!/bin/bash
|
||||
|
||||
# Worker host names and the port used in the transfer-job-manager API URLs.
W0="thesis-worker-0"
W1="thesis-worker-1"
W2="thesis-worker-2"
PORT="60000"
# Base64 "user:password" token for HTTP Basic auth.
# NOTE(review): a credential committed to the repository should be rotated;
# the value can now be supplied through the AUTH environment variable, with
# the previous literal kept as the fallback so existing deployments still work.
AUTH="${AUTH:-YWRtaW46dmVyeXNlY3JldA==}"

# Ground-truth CSV; the header is written only when the file does not exist.
LOG="/var/log/thesis_scenario_ground_truth.csv"
[ ! -f "$LOG" ] && echo "timestamp,action,source,target" >"$LOG"
|
||||
|
||||
# Append one ground-truth row "<timestamp>,<action>,<source>,<target>" to $LOG.
#
# $1 - action, $2 - source, $3 - target
log_gt() {
    local action=$1 source=$2 target=$3

    printf '%s,%s,%s,%s\n' "$(date +"%Y-%m-%d %H:%M:%S")" "$action" "$source" "$target" >>"$LOG"
}
|
||||
|
||||
# Block until the transfer-job-manager API on the given host answers a job
# listing request with HTTP 200, polling every 10 seconds.
#
# $1 - host name or address
#
# Reads the globals PORT and AUTH.
wait_for_api() {
    local host=$1
    local http_code

    echo "Prüfe Verfügbarkeit von TIXstream auf $host..."

    while true; do
        # Timeouts keep a single unresponsive host from stalling the poll
        # loop for an unbounded time; curl reports 000 when no HTTP response
        # was received.
        http_code=$(curl --write-out "%{http_code}" --silent --output /dev/null \
            --connect-timeout 5 --max-time 15 \
            --url "http://$host:$PORT/transfer-job-manager/v1/jobs?limit=1" \
            --header "Authorization: Basic $AUTH")

        # String comparison: an empty result (e.g. curl missing) must not
        # raise an "integer expression expected" error.
        if [ "$http_code" = "200" ]; then
            echo "API auf $host ist ONLINE (Code 200)."
            break
        else
            echo "Warte auf API ($host)... Status: $http_code. Retry in 10s."
            sleep 10
        fi
    done
}
|
||||
|
||||
# Submit one transfer job to the transfer-job-manager API of a host.
#
# $1 - host whose API receives the request
# $2 - value for "destination_system"
# $3 - source file URI (sent as the single entry of "source_file_uris")
# $4 - job description
#
# Reads the globals PORT and AUTH. Returns 0 on success and 1 when the request
# fails (transport error or HTTP status >= 400); a warning goes to stderr.
# Previously any failure was discarded silently.
trigger_job() {
    local host=$1
    local dest_sys=$2
    local file_uri=$3
    local desc=$4
    local DATA

    # jq builds the body so that all values are JSON-escaped correctly.
    DATA=$(jq -n \
        --arg desc "$desc" \
        --arg dest_sys "$dest_sys" \
        --arg file "$file_uri" \
        '{
            "description": $desc,
            "schedule": "NOW",
            "destination_system": $dest_sys,
            "destination_share": "local_sync_destination",
            "source_file_uris": [$file],
            "wait_for_publish": false,
            "flat_file_mode_disabled": false
        }')

    if ! curl --request POST \
        --url "http://$host:$PORT/transfer-job-manager/v1/jobs" \
        --header "Authorization: Basic $AUTH" \
        --header 'Content-Type: application/json' \
        --data "$DATA" --silent --show-error --fail >/dev/null; then
        echo "WARN: job submission to $host failed ($desc)" >&2
        return 1
    fi
}
|
||||
|
||||
# Wait until the API of every worker is reachable before generating load.
for worker in "$W0" "$W1" "$W2"; do
    wait_for_api "$worker"
done

echo "Alle Systeme bereit. Starte Baseline-Szenario Loop..."
log_gt "SYSTEM_READY" "ALL" "Orchestrator started"
INCOMING_DIR="/local_testdata/test-destination"

# Endless baseline scenario; each phase is logged to the ground-truth file
# before its jobs are submitted.
while :; do
    # Phase 1: one large file, submitted on W0 for "Worker Node 1".
    log_gt "START_LARGE" "W0" "W1"
    trigger_job "$W0" "Worker Node 1" "local_sync_source/5g.mxf" "Thesis Large File"
    sleep 60

    # Phase 2: ten small files, submitted on W0 for "Worker Node 2".
    log_gt "START_SMALL" "W0" "W2"
    for ((i = 1; i <= 10; i++)); do
        trigger_job "$W0" "Worker Node 2" "local_sync_source/small_$i.dat" "Thesis Small File $i"
    done
    sleep 30

    # Phase 3: interference load, submitted on W1 for "Worker Node 0".
    log_gt "START_INTERFERENCE" "W1" "W0"
    trigger_job "$W1" "Worker Node 0" "local_sync_source/5g.mxf" "Interference Load"

    sleep 60
    # ${VAR:?} aborts instead of expanding to "/*" if the variable is empty.
    rm -rf "${INCOMING_DIR:?}/"*
done
|
||||
|
|
@ -0,0 +1,13 @@
|
|||
# systemd unit for the thesis watch-tool: started after network.target, run
# as root from /opt/watch-tool, restarted 5 seconds after any exit.
[Unit]
Description=Thesis Watch Tool - Metrics Collector
After=network.target

[Service]
WorkingDirectory=/opt/watch-tool
ExecStart=/opt/watch-tool/watch-tool
User=root
RestartSec=5
Restart=always

[Install]
WantedBy=multi-user.target
|
||||
Loading…
Add table
Add a link
Reference in a new issue