watch-tool/system_metrics.go

523 lines
12 KiB
Go

package main
import (
"context"
"fmt"
"log/slog"
"net"
"os"
"slices"
"sort"
"strconv"
"strings"
"syscall"
"time"
"github.com/elastic/go-elasticsearch/v8"
"github.com/shirou/gopsutil/cpu"
"github.com/shirou/gopsutil/disk"
"github.com/shirou/gopsutil/host"
"github.com/shirou/gopsutil/load"
"github.com/shirou/gopsutil/mem"
psnet "github.com/shirou/gopsutil/net"
"github.com/shirou/gopsutil/process"
"golang.org/x/sys/unix"
)
// SystemMetricsCollector gathers host-level metrics on a fixed schedule and
// keeps the previous samples needed to turn cumulative counters into rates.
type SystemMetricsCollector struct {
	// config selects which metric groups are collected and carries their
	// options (disk paths, interfaces, latency hosts, ...).
	config SystemMetrics

	// pollInterval is the collection period in seconds.
	pollInterval int

	// lastNetworkStats holds the per-interface counters from the previous
	// bandwidth sample, keyed by interface name.
	lastNetworkStats map[string]NetworkStat

	// lastDiskStats holds previously sampled disk I/O counters
	// (presumably keyed by device name).
	lastDiskStats map[string]DiskIOStat

	// lastMeasureTime is the reference instant used to compute the elapsed
	// time between two rate samples.
	lastMeasureTime time.Time
}
// NewSystemMetricsCollector returns a collector for the given configuration.
// pollInterval is the collection period in seconds. The sample caches start
// empty and the measurement clock starts at construction time.
func NewSystemMetricsCollector(config SystemMetrics, pollInterval int) *SystemMetricsCollector {
	collector := new(SystemMetricsCollector)
	collector.config = config
	collector.pollInterval = pollInterval
	collector.lastNetworkStats = map[string]NetworkStat{}
	collector.lastDiskStats = map[string]DiskIOStat{}
	collector.lastMeasureTime = time.Now()
	return collector
}
// Start runs the collection loop until ctx is cancelled. On every tick it
// collects a metrics snapshot and ships it to Elasticsearch under baseIndex.
// Collection and send failures are logged and the loop keeps running.
//
// Start blocks; callers normally run it in its own goroutine. If the
// configured poll interval is not positive the collector logs an error and
// returns immediately.
func (smc *SystemMetricsCollector) Start(ctx context.Context, es *elasticsearch.Client, baseIndex string) {
	// time.NewTicker panics on a non-positive duration; reject a bad interval
	// up front instead of taking the whole process down.
	if smc.pollInterval <= 0 {
		slog.Error("invalid system metrics poll interval, collector not started",
			"poll_interval_seconds", smc.pollInterval)
		return
	}

	ticker := time.NewTicker(time.Duration(smc.pollInterval) * time.Second)
	defer ticker.Stop()

	sender := NewElasticsearchSender(es)
	for {
		select {
		case <-ctx.Done():
			slog.Info("System metrics collector stopped")
			return
		case <-ticker.C:
			metrics, err := smc.collectMetrics()
			if err != nil {
				slog.Error("error collecting system metrics", "error", err)
				continue
			}
			if err := sender.SendSystemMetrics(baseIndex, metrics); err != nil {
				slog.Error("error sending system metrics", "error", err)
			}
		}
	}
}
// collectMetrics builds one snapshot by running every enabled collector in a
// fixed order.
//
// CPU, memory, disk and network are mandatory groups: the first failure among
// them aborts the snapshot and is returned wrapped with the group name. The
// remaining groups are best-effort: a failure is logged as a warning and the
// snapshot continues without that group.
func (smc *SystemMetricsCollector) collectMetrics() (SystemResources, error) {
	type collectorStep struct {
		enabled bool
		label   string // error prefix (mandatory) or warning message (best-effort)
		collect func(*SystemResources) error
	}

	result := NewSystemResources()

	mandatory := []collectorStep{
		{smc.config.CollectCPU, "CPU metrics", smc.collectCPUMetrics},
		{smc.config.CollectMemory, "memory metrics", smc.collectMemoryMetrics},
		{smc.config.CollectDisk, "disk metrics", smc.collectDiskMetrics},
		{smc.config.CollectNetwork, "network metrics", smc.collectNetworkMetrics},
	}
	for _, step := range mandatory {
		if !step.enabled {
			continue
		}
		if err := step.collect(&result); err != nil {
			return result, fmt.Errorf("%s: %w", step.label, err)
		}
	}

	// Order matters here: disk I/O reads the shared measurement timestamp
	// before the bandwidth collector advances it.
	bestEffort := []collectorStep{
		{smc.config.CollectProcesses, "failed to collect process metrics", smc.collectProcessMetrics},
		{smc.config.CollectDiskIO, "failed to collect disk IO metrics", smc.collectDiskIOMetrics},
		{smc.config.CollectNetworkConnections, "failed to collect network connections", smc.collectNetworkConnections},
		{smc.config.CollectLoadAverage, "failed to collect load average", smc.collectLoadAverage},
		{smc.config.CollectTCPStats, "failed to collect TCP stats", smc.collectTCPStats},
		{smc.config.CollectNetworkLatency, "failed to collect network latency", smc.collectNetworkLatency},
		{smc.config.CollectBandwidthUsage, "failed to collect bandwidth usage", smc.collectBandwidthUsage},
		{smc.config.CollectFileHandles, "failed to collect system limits", smc.collectSystemLimits},
	}
	for _, step := range bestEffort {
		if !step.enabled {
			continue
		}
		if err := step.collect(&result); err != nil {
			slog.Warn(step.label, "error", err)
		}
	}

	return result, nil
}
// collectDiskIOMetrics records per-device disk I/O counters, average
// latencies and an I/O utilization estimate into result.DiskIOStats.
//
// The byte/op/time counters and the average latencies are cumulative values
// as reported by the OS. IOUtilization is computed over the last sampling
// window only: the growth of read+write time since the previous sample
// divided by the elapsed wall time, capped at 100. It stays 0 on a device's
// first sample and whenever its counters went backwards (device reset or
// counter wrap).
//
// Returns the error from disk.IOCounters, if any.
func (smc *SystemMetricsCollector) collectDiskIOMetrics(result *SystemResources) error {
	diskIOStats, err := disk.IOCounters()
	if err != nil {
		return err
	}

	currentTime := time.Now()
	// NOTE(review): lastMeasureTime is shared with the bandwidth collector,
	// which advances it later in the same cycle, so this window can be a
	// little shorter than the true gap between two disk samples. A dedicated
	// timestamp field would remove that skew.
	timeDiff := currentTime.Sub(smc.lastMeasureTime).Seconds()

	if smc.lastDiskStats == nil {
		smc.lastDiskStats = make(map[string]DiskIOStat, len(diskIOStats))
	}
	result.DiskIOStats = make(map[string]DiskIOStat, len(diskIOStats))

	for device, stats := range diskIOStats {
		ioStat := DiskIOStat{
			ReadBytes:  stats.ReadBytes,
			WriteBytes: stats.WriteBytes,
			ReadOps:    stats.ReadCount,
			WriteOps:   stats.WriteCount,
			ReadTime:   stats.ReadTime,
			WriteTime:  stats.WriteTime,
		}
		if stats.ReadCount > 0 {
			ioStat.AvgReadLatency = float64(stats.ReadTime) / float64(stats.ReadCount)
		}
		if stats.WriteCount > 0 {
			ioStat.AvgWriteLatency = float64(stats.WriteTime) / float64(stats.WriteCount)
		}

		// Utilization needs the busy time spent during this window, not the
		// cumulative total since boot, so compare with the previous sample.
		prev, seen := smc.lastDiskStats[device]
		if seen && timeDiff > 0 &&
			stats.ReadTime >= prev.ReadTime && stats.WriteTime >= prev.WriteTime {
			// Read/write times are treated as milliseconds, hence the
			// seconds-to-milliseconds scaling of the window.
			busyMs := float64(stats.ReadTime-prev.ReadTime) + float64(stats.WriteTime-prev.WriteTime)
			ioStat.IOUtilization = min(busyMs/(timeDiff*1000)*100, 100)
		}

		result.DiskIOStats[device] = ioStat
		smc.lastDiskStats[device] = ioStat
	}

	// When bandwidth collection is enabled it owns lastMeasureTime and
	// advances it after this collector has run. Otherwise nobody would, and
	// the window above would grow without bound, so advance it here.
	if !smc.config.CollectBandwidthUsage {
		smc.lastMeasureTime = currentTime
	}
	return nil
}
// collectNetworkConnections summarizes the host's current sockets into
// result.NetworkConnections: a total, a per-status histogram, dedicated
// counters for the ESTABLISHED, LISTEN and TIME_WAIT statuses, and the number
// of connections whose local or remote port is one of the configured
// transfer ports.
//
// Returns the error from psnet.Connections, if any.
func (smc *SystemMetricsCollector) collectNetworkConnections(result *SystemResources) error {
	conns, err := psnet.Connections("all")
	if err != nil {
		return err
	}

	transferPorts := smc.config.TransferPorts
	summary := ConnectionStats{ConnectionsByState: map[string]int32{}}

	for i := range conns {
		c := &conns[i]

		summary.TotalConnections++
		summary.ConnectionsByState[c.Status]++

		switch c.Status {
		case "ESTABLISHED":
			summary.EstablishedTCP++
		case "LISTEN":
			summary.ListeningTCP++
		case "TIME_WAIT":
			summary.TimeWaitTCP++
		}

		// A connection counts once even when both endpoints use a transfer port.
		local, remote := int(c.Laddr.Port), int(c.Raddr.Port)
		usesTransferPort := slices.ContainsFunc(transferPorts, func(port int) bool {
			return port == local || port == remote
		})
		if usesTransferPort {
			summary.TransferConnections++
		}
	}

	result.NetworkConnections = summary
	return nil
}
// collectLoadAverage appends the 1, 5 and 15 minute load averages, in that
// order, to result.LoadAverage.
//
// Returns the error from load.Avg, if any.
func (smc *SystemMetricsCollector) collectLoadAverage(result *SystemResources) error {
	avg, err := load.Avg()
	if err != nil {
		return err
	}
	result.LoadAverage = append(result.LoadAverage, avg.Load1, avg.Load5, avg.Load15)
	return nil
}
// collectTCPStats stores a TCPStatistics value in result.TCPStats.
//
// It reads /proc/net/netstat and locates the "TcpExt:" lines, but nothing is
// extracted from them yet, so the stored value is always the zero value. A
// missing or unreadable file is ignored. The function never returns an error.
func (smc *SystemMetricsCollector) collectTCPStats(result *SystemResources) error {
	var tcpStats TCPStatistics

	raw, err := os.ReadFile("/proc/net/netstat")
	if err == nil {
		for line := range strings.SplitSeq(string(raw), "\n") {
			if !strings.HasPrefix(line, "TcpExt:") {
				continue
			}
			// TODO: parse the TcpExt header/value line pair into tcpStats.
		}
	}

	result.TCPStats = tcpStats
	return nil
}
// collectNetworkLatency probes every configured latency test host, one after
// another, and stores the measurements in result.NetworkLatency keyed by
// host. It never returns an error; unreachable hosts are reported through
// their LatencyInfo.
func (smc *SystemMetricsCollector) collectNetworkLatency(result *SystemResources) error {
	targets := smc.config.LatencyTestHosts
	latencyByHost := make(map[string]LatencyInfo)
	for i := range targets {
		latencyByHost[targets[i]] = smc.measureLatency(targets[i])
	}
	result.NetworkLatency = latencyByHost
	return nil
}
// measureLatency estimates the round-trip latency to host by timing five TCP
// connection attempts (3 s timeout each, 100 ms apart).
//
// host may be a hostname, an IPv4 address, a bare or bracketed IPv6 literal,
// or a "host:port" pair; port 80 is used when no port is given.
//
// The returned LatencyInfo carries the minimum, maximum and average connect
// time over the successful attempts, Jitter as max minus min, and PacketLoss
// as the percentage of failed attempts. When every attempt fails only Host
// and PacketLoss (100) are set.
func (smc *SystemMetricsCollector) measureLatency(host string) LatencyInfo {
	const (
		attempts    = 5
		dialTimeout = 3 * time.Second
		probeGap    = 100 * time.Millisecond
	)

	addr := latencyDialAddress(host)
	latencies := make([]time.Duration, 0, attempts)
	for i := range attempts {
		// Pause between probes only; sleeping after the last one would just
		// delay the whole metrics cycle.
		if i > 0 {
			time.Sleep(probeGap)
		}
		start := time.Now()
		conn, err := net.DialTimeout("tcp", addr, dialTimeout)
		if err != nil {
			continue
		}
		latencies = append(latencies, time.Since(start))
		// The connection only served as a probe; a close error carries no
		// useful information.
		_ = conn.Close()
	}

	if len(latencies) == 0 {
		return LatencyInfo{Host: host, PacketLoss: 100.0}
	}

	var total time.Duration
	lowest, highest := latencies[0], latencies[0]
	for _, lat := range latencies {
		total += lat
		lowest = min(lowest, lat)
		highest = max(highest, lat)
	}

	return LatencyInfo{
		Host:       host,
		MinLatency: lowest,
		MaxLatency: highest,
		AvgLatency: total / time.Duration(len(latencies)),
		PacketLoss: float64(attempts-len(latencies)) / attempts * 100,
		Jitter:     highest - lowest,
	}
}

// latencyDialAddress turns a configured latency host into a dialable
// "host:port" address, defaulting to port 80.
func latencyDialAddress(host string) string {
	// Already a host:port pair: use it as configured.
	if _, port, err := net.SplitHostPort(host); err == nil && port != "" {
		return host
	}
	// Already-bracketed IPv6 literal: JoinHostPort would bracket it twice.
	if strings.HasPrefix(host, "[") && strings.HasSuffix(host, "]") {
		return host + ":80"
	}
	// JoinHostPort adds the brackets a bare IPv6 literal needs.
	return net.JoinHostPort(host, "80")
}
// collectBandwidthUsage derives per-interface throughput from the growth of
// the interface byte counters since the previous call and stores it in
// result.BandwidthUtilization.
//
// Only interfaces listed in the configuration are reported (all of them when
// the list is empty), but the counters of every interface are remembered for
// the next call. Throughput is in MiB/s. An interface seen for the first
// time, or whose counters went backwards (reset or wrap), is reported with
// zero throughput for this sample.
//
// Returns the error from psnet.IOCounters, if any.
func (smc *SystemMetricsCollector) collectBandwidthUsage(result *SystemResources) error {
	netStats, err := psnet.IOCounters(true)
	if err != nil {
		return err
	}

	const (
		bytesPerMiB = 1024 * 1024
		bitsPerMbit = 1e6
		// NOTE(review): link speed is assumed to be 1 Gbit/s for every
		// interface; confirm or make it configurable.
		linkCapacityMbps = 1000.0
	)

	currentTime := time.Now()
	timeDiff := currentTime.Sub(smc.lastMeasureTime).Seconds()

	if smc.lastNetworkStats == nil {
		smc.lastNetworkStats = make(map[string]NetworkStat, len(netStats))
	}
	result.BandwidthUtilization = make(map[string]BandwidthInfo, len(netStats))

	wanted := smc.config.NetworkInterfaces
	for _, stat := range netStats {
		if len(wanted) == 0 || slices.Contains(wanted, stat.Name) {
			bandwidth := BandwidthInfo{Interface: stat.Name}

			lastStat, exists := smc.lastNetworkStats[stat.Name]
			// The counters are unsigned: subtracting after a reset would
			// underflow into an absurdly large rate, so skip such samples.
			monotonic := stat.BytesRecv >= lastStat.BytesRecv && stat.BytesSent >= lastStat.BytesSent
			if exists && timeDiff > 0 && monotonic {
				bytesInPerSec := float64(stat.BytesRecv-lastStat.BytesRecv) / timeDiff
				bytesOutPerSec := float64(stat.BytesSent-lastStat.BytesSent) / timeDiff

				bandwidth.CurrentThroughputIn = bytesInPerSec / bytesPerMiB
				bandwidth.CurrentThroughputOut = bytesOutPerSec / bytesPerMiB
				// NOTE(review): no running maximum is kept, so "peak" is
				// simply the current sample.
				bandwidth.PeakThroughputIn = bandwidth.CurrentThroughputIn
				bandwidth.PeakThroughputOut = bandwidth.CurrentThroughputOut

				// Link capacity is in megabits per second, so convert the
				// byte rate to Mbit/s before comparing the two.
				totalMbps := (bytesInPerSec + bytesOutPerSec) * 8 / bitsPerMbit
				bandwidth.UtilizationPercent = totalMbps / linkCapacityMbps * 100
			}
			result.BandwidthUtilization[stat.Name] = bandwidth
		}

		// Remember every interface, reported or not, as the next baseline.
		smc.lastNetworkStats[stat.Name] = NetworkStat{
			BytesSent:   stat.BytesSent,
			BytesRecv:   stat.BytesRecv,
			PacketsSent: stat.PacketsSent,
			PacketsRecv: stat.PacketsRecv,
		}
	}

	smc.lastMeasureTime = currentTime
	return nil
}
// collectSystemLimits fills result.SystemLimits with file-handle and process
// limits. Each source is best-effort: a value that cannot be read or parsed
// is left at zero, and the function never returns an error.
func (smc *SystemMetricsCollector) collectSystemLimits(result *SystemResources) error {
	var info SystemLimitInfo

	// /proc/sys/fs/file-max holds a single number.
	if raw, readErr := os.ReadFile("/proc/sys/fs/file-max"); readErr == nil {
		text := strings.TrimSpace(string(raw))
		if ceiling, parseErr := strconv.ParseUint(text, 10, 64); parseErr == nil {
			info.MaxOpenFiles = ceiling
		}
	}

	// Only the first whitespace-separated field of file-nr is used.
	if raw, readErr := os.ReadFile("/proc/sys/fs/file-nr"); readErr == nil {
		if parts := strings.Fields(string(raw)); len(parts) > 0 {
			if inUse, parseErr := strconv.ParseUint(parts[0], 10, 64); parseErr == nil {
				info.CurrentOpenFiles = inUse
				// Usage is only meaningful when the ceiling was read above.
				if info.MaxOpenFiles > 0 {
					info.FileDescriptorUsage = float64(inUse) / float64(info.MaxOpenFiles) * 100
				}
			}
		}
	}

	// The hard RLIMIT_NPROC value is reported as the process limit.
	var nproc syscall.Rlimit
	if syscall.Getrlimit(unix.RLIMIT_NPROC, &nproc) == nil {
		info.MaxProcesses = nproc.Max
	}

	result.SystemLimits = info
	return nil
}
// collectProcessMetrics records the most CPU-hungry processes in
// result.TopProcesses and the summed open file descriptor count in
// result.OpenFileDescriptors.
//
// A process whose name, CPU usage or memory info cannot be read (typically
// because it exited or is not accessible) is skipped entirely, including for
// the descriptor total. Status and creation time are optional and default to
// their zero values. The list is sorted by CPU usage, highest first, and
// truncated to the configured TopProcessesLimit; a negative limit keeps the
// full list.
//
// Returns the error from process.Processes, if any.
func (smc *SystemMetricsCollector) collectProcessMetrics(result *SystemResources) error {
	processes, err := process.Processes()
	if err != nil {
		return err
	}

	var processInfos []ProcessInfo
	var totalOpenFiles int32
	for _, p := range processes {
		name, err := p.Name()
		if err != nil {
			continue
		}
		cpuPercent, err := p.CPUPercent()
		if err != nil {
			continue
		}
		memInfo, err := p.MemoryInfo()
		// Guard against a nil result alongside a nil error so the RSS read
		// below cannot dereference nil.
		if err != nil || memInfo == nil {
			continue
		}

		// Status and creation time are nice-to-have; fall back to zero values.
		status, err := p.Status()
		if err != nil {
			status = ""
		}
		createTime, err := p.CreateTime()
		if err != nil {
			createTime = 0
		}

		if openFiles, err := p.NumFDs(); err == nil {
			totalOpenFiles += openFiles
		}

		processInfos = append(processInfos, ProcessInfo{
			PID:        p.Pid,
			Name:       name,
			CPUPercent: cpuPercent,
			MemoryMB:   float32(memInfo.RSS) / 1024 / 1024,
			Status:     status,
			CreateTime: createTime,
		})
	}

	sort.Slice(processInfos, func(i, j int) bool {
		return processInfos[i].CPUPercent > processInfos[j].CPUPercent
	})

	// Slicing with a negative bound panics, so a negative limit is treated
	// as "no limit" instead of crashing the collector.
	if limit := smc.config.TopProcessesLimit; limit >= 0 && len(processInfos) > limit {
		processInfos = processInfos[:limit]
	}

	result.TopProcesses = processInfos
	result.OpenFileDescriptors = totalOpenFiles
	return nil
}
// collectCPUMetrics stores the aggregate CPU usage, sampled over one second,
// in result.CPUPercent, and the host uptime in result.Uptime.
//
// The call blocks for the one-second sampling window. A failure to read host
// info is ignored and leaves Uptime untouched. Returns the error from
// cpu.Percent, if any.
func (smc *SystemMetricsCollector) collectCPUMetrics(result *SystemResources) error {
	samples, err := cpu.Percent(time.Second, false)
	if err != nil {
		return err
	}
	// With per-CPU reporting off, the first element is the overall figure.
	if len(samples) != 0 {
		result.CPUPercent = samples[0]
	}

	info, infoErr := host.Info()
	if infoErr != nil {
		return nil
	}
	result.Uptime = info.Uptime
	return nil
}
// collectMemoryMetrics copies the used, total and used-percent figures of
// virtual memory into result.
//
// Returns the error from mem.VirtualMemory, if any; result is left untouched
// in that case.
func (smc *SystemMetricsCollector) collectMemoryMetrics(result *SystemResources) error {
	vm, err := mem.VirtualMemory()
	if err == nil {
		result.MemoryTotal, result.MemoryUsed, result.MemoryPercent = vm.Total, vm.Used, vm.UsedPercent
	}
	return err
}
// collectDiskMetrics records the usage of every configured disk path in
// result.DiskUsage, keyed by path.
//
// A path whose usage cannot be read is logged and skipped. The function
// always returns nil.
func (smc *SystemMetricsCollector) collectDiskMetrics(result *SystemResources) error {
	for _, mount := range smc.config.DiskPaths {
		usage, err := disk.Usage(mount)
		if err == nil {
			result.DiskUsage[mount] = DiskUsage{
				Used:        usage.Used,
				Total:       usage.Total,
				UsedPercent: usage.UsedPercent,
				Free:        usage.Free,
			}
			continue
		}
		slog.Error("error reading disk stats", "path", mount, "error", err)
	}
	return nil
}
// collectNetworkMetrics stores the cumulative byte and packet counters of
// each network interface in result.NetworkStats, keyed by interface name.
//
// When the configuration lists interfaces, only those are recorded; an empty
// list records all of them. Returns the error from psnet.IOCounters, if any.
func (smc *SystemMetricsCollector) collectNetworkMetrics(result *SystemResources) error {
	counters, err := psnet.IOCounters(true)
	if err != nil {
		return err
	}

	wanted := smc.config.NetworkInterfaces
	for _, nic := range counters {
		if len(wanted) > 0 && !slices.Contains(wanted, nic.Name) {
			continue
		}
		result.NetworkStats[nic.Name] = NetworkStat{
			BytesSent:   nic.BytesSent,
			BytesRecv:   nic.BytesRecv,
			PacketsSent: nic.PacketsSent,
			PacketsRecv: nic.PacketsRecv,
		}
	}
	return nil
}