feat(system-metrics,service-metrics,elastic): update elastic-client to version 8, improve parser logic to add more information, improve system monitor to add more information
This commit is contained in:
parent
6c098ed61c
commit
159df116c8
10 changed files with 587 additions and 190 deletions
|
|
@ -4,28 +4,41 @@ import (
|
|||
"context"
|
||||
"fmt"
|
||||
"log/slog"
|
||||
"net"
|
||||
"os"
|
||||
"slices"
|
||||
"sort"
|
||||
"strconv"
|
||||
"strings"
|
||||
"syscall"
|
||||
"time"
|
||||
|
||||
"github.com/elastic/go-elasticsearch/v7"
|
||||
"github.com/elastic/go-elasticsearch/v8"
|
||||
"github.com/shirou/gopsutil/cpu"
|
||||
"github.com/shirou/gopsutil/disk"
|
||||
"github.com/shirou/gopsutil/host"
|
||||
"github.com/shirou/gopsutil/load"
|
||||
"github.com/shirou/gopsutil/mem"
|
||||
"github.com/shirou/gopsutil/net"
|
||||
psnet "github.com/shirou/gopsutil/net"
|
||||
"github.com/shirou/gopsutil/process"
|
||||
"golang.org/x/sys/unix"
|
||||
)
|
||||
|
||||
type SystemMetricsCollector struct {
|
||||
config SystemMetrics
|
||||
pollInterval int
|
||||
config SystemMetrics
|
||||
pollInterval int
|
||||
lastNetworkStats map[string]NetworkStat
|
||||
lastDiskStats map[string]DiskIOStat
|
||||
lastMeasureTime time.Time
|
||||
}
|
||||
|
||||
func NewSystemMetricsCollector(config SystemMetrics, pollInterval int) *SystemMetricsCollector {
|
||||
return &SystemMetricsCollector{
|
||||
config: config,
|
||||
pollInterval: pollInterval,
|
||||
config: config,
|
||||
pollInterval: pollInterval,
|
||||
lastNetworkStats: make(map[string]NetworkStat),
|
||||
lastDiskStats: make(map[string]DiskIOStat),
|
||||
lastMeasureTime: time.Now(),
|
||||
}
|
||||
}
|
||||
|
||||
|
|
@ -88,10 +101,306 @@ func (smc *SystemMetricsCollector) collectMetrics() (SystemResources, error) {
|
|||
slog.Warn("failed to collect process metrics", "error", err)
|
||||
}
|
||||
}
|
||||
if smc.config.CollectDiskIO {
|
||||
if err = smc.collectDiskIOMetrics(&result); err != nil {
|
||||
slog.Warn("failed to collect disk IO metrics", "error", err)
|
||||
}
|
||||
}
|
||||
|
||||
if smc.config.CollectNetworkConnections {
|
||||
if err = smc.collectNetworkConnections(&result); err != nil {
|
||||
slog.Warn("failed to collect network connections", "error", err)
|
||||
}
|
||||
}
|
||||
|
||||
if smc.config.CollectLoadAverage {
|
||||
if err = smc.collectLoadAverage(&result); err != nil {
|
||||
slog.Warn("failed to collect load average", "error", err)
|
||||
}
|
||||
}
|
||||
|
||||
if smc.config.CollectTCPStats {
|
||||
if err = smc.collectTCPStats(&result); err != nil {
|
||||
slog.Warn("failed to collect TCP stats", "error", err)
|
||||
}
|
||||
}
|
||||
|
||||
if smc.config.CollectNetworkLatency {
|
||||
if err = smc.collectNetworkLatency(&result); err != nil {
|
||||
slog.Warn("failed to collect network latency", "error", err)
|
||||
}
|
||||
}
|
||||
|
||||
if smc.config.CollectBandwidthUsage {
|
||||
if err = smc.collectBandwidthUsage(&result); err != nil {
|
||||
slog.Warn("failed to collect bandwidth usage", "error", err)
|
||||
}
|
||||
}
|
||||
|
||||
if smc.config.CollectFileHandles {
|
||||
if err = smc.collectSystemLimits(&result); err != nil {
|
||||
slog.Warn("failed to collect system limits", "error", err)
|
||||
}
|
||||
}
|
||||
|
||||
return result, nil
|
||||
}
|
||||
|
||||
func (smc *SystemMetricsCollector) collectDiskIOMetrics(result *SystemResources) error {
|
||||
diskIOStats, err := disk.IOCounters()
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
currentTime := time.Now()
|
||||
timeDiff := currentTime.Sub(smc.lastMeasureTime).Seconds()
|
||||
|
||||
result.DiskIOStats = make(map[string]DiskIOStat)
|
||||
|
||||
for device, stats := range diskIOStats {
|
||||
ioStat := DiskIOStat{
|
||||
ReadBytes: stats.ReadBytes,
|
||||
WriteBytes: stats.WriteBytes,
|
||||
ReadOps: stats.ReadCount,
|
||||
WriteOps: stats.WriteCount,
|
||||
ReadTime: stats.ReadTime,
|
||||
WriteTime: stats.WriteTime,
|
||||
}
|
||||
|
||||
if stats.ReadCount > 0 {
|
||||
ioStat.AvgReadLatency = float64(stats.ReadTime) / float64(stats.ReadCount)
|
||||
}
|
||||
if stats.WriteCount > 0 {
|
||||
ioStat.AvgWriteLatency = float64(stats.WriteTime) / float64(stats.WriteCount)
|
||||
}
|
||||
|
||||
if timeDiff > 0 {
|
||||
totalTime := float64(stats.ReadTime + stats.WriteTime)
|
||||
ioStat.IOUtilization = (totalTime / (timeDiff * 1000)) * 100
|
||||
if ioStat.IOUtilization > 100 {
|
||||
ioStat.IOUtilization = 100
|
||||
}
|
||||
}
|
||||
|
||||
result.DiskIOStats[device] = ioStat
|
||||
}
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
func (smc *SystemMetricsCollector) collectNetworkConnections(result *SystemResources) error {
|
||||
connections, err := psnet.Connections("all")
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
stats := ConnectionStats{
|
||||
ConnectionsByState: make(map[string]int32),
|
||||
}
|
||||
|
||||
for _, conn := range connections {
|
||||
stats.TotalConnections++
|
||||
|
||||
stats.ConnectionsByState[conn.Status]++
|
||||
|
||||
switch conn.Status {
|
||||
case "ESTABLISHED":
|
||||
stats.EstablishedTCP++
|
||||
case "LISTEN":
|
||||
stats.ListeningTCP++
|
||||
case "TIME_WAIT":
|
||||
stats.TimeWaitTCP++
|
||||
}
|
||||
|
||||
if slices.Contains(smc.config.TransferPorts, int(conn.Laddr.Port)) ||
|
||||
slices.Contains(smc.config.TransferPorts, int(conn.Raddr.Port)) {
|
||||
stats.TransferConnections++
|
||||
}
|
||||
}
|
||||
|
||||
result.NetworkConnections = stats
|
||||
return nil
|
||||
}
|
||||
|
||||
func (smc *SystemMetricsCollector) collectLoadAverage(result *SystemResources) error {
|
||||
loadAvg, err := load.Avg()
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
result.LoadAverage = append(result.LoadAverage, loadAvg.Load1)
|
||||
result.LoadAverage = append(result.LoadAverage, loadAvg.Load5)
|
||||
result.LoadAverage = append(result.LoadAverage, loadAvg.Load15)
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
func (smc *SystemMetricsCollector) collectTCPStats(result *SystemResources) error {
|
||||
tcpStats := TCPStatistics{}
|
||||
|
||||
if data, err := os.ReadFile("/proc/net/netstat"); err == nil {
|
||||
content := string(data)
|
||||
lines := strings.SplitSeq(content, "\n")
|
||||
for line := range lines {
|
||||
if strings.HasPrefix(line, "TcpExt:") {
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
result.TCPStats = tcpStats
|
||||
return nil
|
||||
}
|
||||
|
||||
func (smc *SystemMetricsCollector) collectNetworkLatency(result *SystemResources) error {
|
||||
result.NetworkLatency = make(map[string]LatencyInfo)
|
||||
|
||||
for _, host := range smc.config.LatencyTestHosts {
|
||||
latency := smc.measureLatency(host)
|
||||
result.NetworkLatency[host] = latency
|
||||
}
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
func (smc *SystemMetricsCollector) measureLatency(host string) LatencyInfo {
|
||||
var latencies []time.Duration
|
||||
var successful int
|
||||
|
||||
for range 5 {
|
||||
start := time.Now()
|
||||
conn, err := net.DialTimeout("tcp", host+":80", 3*time.Second)
|
||||
if err == nil {
|
||||
latency := time.Since(start)
|
||||
latencies = append(latencies, latency)
|
||||
conn.Close()
|
||||
successful++
|
||||
}
|
||||
time.Sleep(100 * time.Millisecond)
|
||||
}
|
||||
|
||||
if len(latencies) == 0 {
|
||||
return LatencyInfo{Host: host, PacketLoss: 100.0}
|
||||
}
|
||||
|
||||
var total time.Duration
|
||||
min := latencies[0]
|
||||
max := latencies[0]
|
||||
|
||||
for _, lat := range latencies {
|
||||
total += lat
|
||||
if lat < min {
|
||||
min = lat
|
||||
}
|
||||
if lat > max {
|
||||
max = lat
|
||||
}
|
||||
}
|
||||
|
||||
avg := total / time.Duration(len(latencies))
|
||||
packetLoss := float64(5-successful) / 5.0 * 100.0
|
||||
jitter := max - min
|
||||
|
||||
return LatencyInfo{
|
||||
Host: host,
|
||||
MinLatency: min,
|
||||
MaxLatency: max,
|
||||
AvgLatency: avg,
|
||||
PacketLoss: packetLoss,
|
||||
Jitter: jitter,
|
||||
}
|
||||
}
|
||||
|
||||
func (smc *SystemMetricsCollector) collectBandwidthUsage(result *SystemResources) error {
|
||||
netStats, err := psnet.IOCounters(true)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
result.BandwidthUtilization = make(map[string]BandwidthInfo)
|
||||
currentTime := time.Now()
|
||||
timeDiff := currentTime.Sub(smc.lastMeasureTime).Seconds()
|
||||
|
||||
for _, stat := range netStats {
|
||||
if len(smc.config.NetworkInterfaces) > 0 &&
|
||||
!slices.Contains(smc.config.NetworkInterfaces, stat.Name) {
|
||||
continue
|
||||
}
|
||||
|
||||
bandwidth := BandwidthInfo{Interface: stat.Name}
|
||||
|
||||
if lastStat, exists := smc.lastNetworkStats[stat.Name]; exists && timeDiff > 0 {
|
||||
bytesDiffIn := float64(stat.BytesRecv - lastStat.BytesRecv)
|
||||
bytesDiffOut := float64(stat.BytesSent - lastStat.BytesSent)
|
||||
|
||||
bandwidth.CurrentThroughputIn = (bytesDiffIn / timeDiff) / (1024 * 1024) // MB/s
|
||||
bandwidth.CurrentThroughputOut = (bytesDiffOut / timeDiff) / (1024 * 1024)
|
||||
|
||||
bandwidth.PeakThroughputIn = bandwidth.CurrentThroughputIn
|
||||
bandwidth.PeakThroughputOut = bandwidth.CurrentThroughputOut
|
||||
|
||||
linkCapacityMbps := 1000.0
|
||||
totalThroughput := bandwidth.CurrentThroughputIn + bandwidth.CurrentThroughputOut
|
||||
bandwidth.UtilizationPercent = (totalThroughput / linkCapacityMbps) * 100
|
||||
}
|
||||
|
||||
result.BandwidthUtilization[stat.Name] = bandwidth
|
||||
}
|
||||
|
||||
for _, stat := range netStats {
|
||||
smc.lastNetworkStats[stat.Name] = NetworkStat{
|
||||
BytesSent: stat.BytesSent,
|
||||
BytesRecv: stat.BytesRecv,
|
||||
PacketsSent: stat.PacketsSent,
|
||||
PacketsRecv: stat.PacketsRecv,
|
||||
}
|
||||
}
|
||||
|
||||
smc.lastMeasureTime = currentTime
|
||||
return nil
|
||||
}
|
||||
|
||||
func (smc *SystemMetricsCollector) collectSystemLimits(result *SystemResources) error {
|
||||
limits := SystemLimitInfo{}
|
||||
|
||||
if data, err := os.ReadFile("/proc/sys/fs/file-max"); err == nil {
|
||||
if maxFiles, err := strconv.ParseUint(strings.TrimSpace(string(data)), 10, 64); err == nil {
|
||||
limits.MaxOpenFiles = maxFiles
|
||||
}
|
||||
}
|
||||
|
||||
if data, err := os.ReadFile("/proc/sys/fs/file-nr"); err == nil {
|
||||
fields := strings.Fields(string(data))
|
||||
if len(fields) >= 1 {
|
||||
if currentFiles, err := strconv.ParseUint(fields[0], 10, 64); err == nil {
|
||||
limits.CurrentOpenFiles = currentFiles
|
||||
if limits.MaxOpenFiles > 0 {
|
||||
limits.FileDescriptorUsage = float64(currentFiles) / float64(limits.MaxOpenFiles) * 100
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
var rlimit syscall.Rlimit
|
||||
if err := syscall.Getrlimit(unix.RLIMIT_NPROC, &rlimit); err == nil {
|
||||
limits.MaxProcesses = rlimit.Max
|
||||
}
|
||||
|
||||
result.SystemLimits = limits
|
||||
return nil
|
||||
}
|
||||
|
||||
// // Helper functions
|
||||
// func NewSystemResources() SystemResources {
|
||||
// return SystemResources{
|
||||
// Timestamp: time.Now(),
|
||||
// DiskUsage: make(map[string]DiskUsage),
|
||||
// DiskIOStats: make(map[string]DiskIOStat),
|
||||
// NetworkStats: make(map[string]NetworkStat),
|
||||
// NetworkLatency: make(map[string]LatencyInfo),
|
||||
// BandwidthUtilization: make(map[string]BandwidthInfo),
|
||||
// }
|
||||
// }
|
||||
|
||||
func (smc *SystemMetricsCollector) collectProcessMetrics(result *SystemResources) error {
|
||||
processes, err := process.Processes()
|
||||
if err != nil {
|
||||
|
|
@ -206,7 +515,7 @@ func (smc *SystemMetricsCollector) collectDiskMetrics(result *SystemResources) e
|
|||
}
|
||||
|
||||
func (smc *SystemMetricsCollector) collectNetworkMetrics(result *SystemResources) error {
|
||||
netStats, err := net.IOCounters(true)
|
||||
netStats, err := psnet.IOCounters(true)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue