542 lines
17 KiB
Go
542 lines
17 KiB
Go
package collector
|
||
|
||
import (
|
||
"bufio"
|
||
"context"
|
||
"log"
|
||
"os"
|
||
"strconv"
|
||
"strings"
|
||
"sync"
|
||
"sync/atomic"
|
||
"time"
|
||
|
||
"codeberg.org/pata1704/guenther/pkg/types"
|
||
)
|
||
|
||
// MetricCollector samples Linux system metrics from /proc at a fixed interval
|
||
// and emits a types.MetricSnapshot for each sample.
|
||
//
|
||
// All /proc reads happen in the single collector goroutine, so no locking is
|
||
// required for the delta-state fields. The output channel uses a non-blocking
|
||
// send; overflows are counted in the dropped counter via load-shedding.
|
||
type MetricCollector struct {
|
||
outputChan chan<- types.MetricSnapshot
|
||
healthChan chan<- types.StageHealth
|
||
|
||
interval time.Duration
|
||
netInterface string
|
||
diskDevice string
|
||
|
||
wg sync.WaitGroup
|
||
|
||
// Delta state – only accessed from the single collector goroutine.
|
||
prevSoftnetDropped uint64
|
||
prevSoftnetSqueeze uint64
|
||
prevNetPacketsIn uint64
|
||
prevNetPacketsOut uint64
|
||
prevDiskReadsComp uint64
|
||
prevDiskWritesComp uint64
|
||
prevDiskRead uint64
|
||
prevDiskWrite uint64
|
||
prevDiskReadTimeMs uint64
|
||
prevDiskWriteTimeMs uint64
|
||
prevDiskIOTicks uint64
|
||
prevCPUTotal uint64
|
||
prevCPUIdle uint64
|
||
prevCPUIoWait uint64
|
||
prevCPUSoftIrq uint64
|
||
prevCtxt uint64
|
||
prevIntr uint64
|
||
prevNetIn uint64
|
||
prevNetOut uint64
|
||
prevNetErrs uint64
|
||
prevNetDrops uint64
|
||
prevTCPRetrans uint64
|
||
prevTCPTimeouts uint64
|
||
prevTCPLostRetrans uint64
|
||
prevTCPFastRetrans uint64
|
||
prevTime time.Time
|
||
firstSample bool
|
||
|
||
processed atomic.Uint64
|
||
dropped atomic.Uint64
|
||
}
|
||
|
||
func NewMetricCollector(
|
||
output chan<- types.MetricSnapshot,
|
||
health chan<- types.StageHealth,
|
||
interval time.Duration,
|
||
netIntf, diskDev string,
|
||
) *MetricCollector {
|
||
return &MetricCollector{
|
||
outputChan: output,
|
||
healthChan: health,
|
||
interval: interval,
|
||
netInterface: netIntf,
|
||
diskDevice: diskDev,
|
||
firstSample: true,
|
||
}
|
||
}
|
||
|
||
func (c *MetricCollector) Start(ctx context.Context) {
|
||
ticker := time.NewTicker(c.interval)
|
||
reportTicker := time.NewTicker(5 * time.Second)
|
||
c.prevTime = time.Now()
|
||
|
||
c.wg.Go(func() {
|
||
defer ticker.Stop()
|
||
defer reportTicker.Stop()
|
||
|
||
for {
|
||
select {
|
||
case <-ticker.C:
|
||
snap := c.collect()
|
||
if snap == nil {
|
||
continue
|
||
}
|
||
select {
|
||
case c.outputChan <- *snap:
|
||
c.processed.Add(1)
|
||
default:
|
||
c.dropped.Add(1)
|
||
}
|
||
|
||
case <-reportTicker.C:
|
||
c.emitHealth()
|
||
|
||
case <-ctx.Done():
|
||
return
|
||
}
|
||
}
|
||
})
|
||
}
|
||
|
||
// Wait waits for the collector goroutine to exit after context cancellation.
|
||
func (c *MetricCollector) Wait() {
|
||
c.wg.Wait()
|
||
}
|
||
|
||
// ── collection ────────────────────────────────────────────────────────────────
|
||
|
||
func (c *MetricCollector) collect() *types.MetricSnapshot {
|
||
now := time.Now()
|
||
duration := now.Sub(c.prevTime).Seconds()
|
||
|
||
cpuTotal, cpuIdle, cpuIowait, cpuSoftirq, ctxt, intr := c.readSystemStats()
|
||
memUsed, memCached, memDirty := c.readMemInfo()
|
||
netIn, netOut, netErrs, netDrops, rxPackets, txPackets := c.readNetDev()
|
||
retrans := c.readSNMPStats()
|
||
timeouts, lostRetrans, fastRetrans := c.readNetstat()
|
||
softDropped, softSqueeze := c.readSoftnetStat()
|
||
diskRead, diskWrite, diskReadTime, diskWriteTime, diskIOTicks, readsComp, writesComp := c.readDiskStats()
|
||
|
||
if c.firstSample {
|
||
c.storePrev(now,
|
||
cpuTotal, cpuIdle, cpuIowait, cpuSoftirq, ctxt, intr,
|
||
netIn, netOut, netErrs, netDrops, rxPackets, txPackets,
|
||
retrans, timeouts, lostRetrans, fastRetrans,
|
||
softDropped, softSqueeze,
|
||
diskRead, diskWrite, diskReadTime, diskWriteTime, diskIOTicks, readsComp, writesComp)
|
||
c.firstSample = false
|
||
return nil
|
||
}
|
||
|
||
if duration < 1e-6 {
|
||
duration = 1e-6
|
||
}
|
||
|
||
cpuDelta := saturatingSub(cpuTotal, c.prevCPUTotal)
|
||
cpuIdleDelta := saturatingSub(cpuIdle, c.prevCPUIdle)
|
||
cpuPercent, cpuIowaitPercent, cpuSoftirqPercent := 0.0, 0.0, 0.0
|
||
if cpuDelta > 0 {
|
||
cpuPercent = float64(cpuDelta-cpuIdleDelta) / float64(cpuDelta) * 100.0
|
||
cpuIowaitPercent = float64(saturatingSub(cpuIowait, c.prevCPUIoWait)) / float64(cpuDelta) * 100.0
|
||
cpuSoftirqPercent = float64(saturatingSub(cpuSoftirq, c.prevCPUSoftIrq)) / float64(cpuDelta) * 100.0
|
||
}
|
||
|
||
snap := &types.MetricSnapshot{
|
||
Timestamp: now,
|
||
CPUPercent: cpuPercent,
|
||
CPUIoWaitPercent: cpuIowaitPercent,
|
||
CPUSoftIrqPercent: cpuSoftirqPercent,
|
||
ContextSwitchesPerS: float64(saturatingSub(ctxt, c.prevCtxt)) / duration,
|
||
InterruptsPerS: float64(saturatingSub(intr, c.prevIntr)) / duration,
|
||
MemoryUsedMB: float64(memUsed),
|
||
MemoryCachedMB: float64(memCached),
|
||
MemoryDirtyMB: float64(memDirty),
|
||
NetworkInMBps: float64(saturatingSub(netIn, c.prevNetIn)) / duration / 1_048_576,
|
||
NetworkOutMBps: float64(saturatingSub(netOut, c.prevNetOut)) / duration / 1_048_576,
|
||
NetErrorsPerS: float64(saturatingSub(netErrs, c.prevNetErrs)) / duration,
|
||
NetDropsPerS: float64(saturatingSub(netDrops, c.prevNetDrops)) / duration,
|
||
TCPRetransPerS: float64(saturatingSub(retrans, c.prevTCPRetrans)) / duration,
|
||
TCPTimeoutsPerS: float64(saturatingSub(timeouts, c.prevTCPTimeouts)) / duration,
|
||
TCPLostRetransmitPerS: float64(saturatingSub(lostRetrans, c.prevTCPLostRetrans)) / duration,
|
||
TCPFastRetransPerS: float64(saturatingSub(fastRetrans, c.prevTCPFastRetrans)) / duration,
|
||
SoftnetDroppedPerS: float64(saturatingSub(softDropped, c.prevSoftnetDropped)) / duration,
|
||
SoftnetTimeSqueezePerS: float64(saturatingSub(softSqueeze, c.prevSoftnetSqueeze)) / duration,
|
||
DiskReadMBps: float64(saturatingSub(diskRead, c.prevDiskRead)) / duration / 1_048_576,
|
||
DiskWriteMBps: float64(saturatingSub(diskWrite, c.prevDiskWrite)) / duration / 1_048_576,
|
||
DiskReadTimeMsPerS: float64(saturatingSub(diskReadTime, c.prevDiskReadTimeMs)) / duration,
|
||
DiskWriteTimeMsPerS: float64(saturatingSub(diskWriteTime, c.prevDiskWriteTimeMs)) / duration,
|
||
DiskIOTicksPerS: float64(saturatingSub(diskIOTicks, c.prevDiskIOTicks)) / duration,
|
||
NetPacketsInPerS: float64(saturatingSub(rxPackets, c.prevNetPacketsIn)) / duration,
|
||
NetPacketsOutPerS: float64(saturatingSub(txPackets, c.prevNetPacketsOut)) / duration,
|
||
DiskReadsCompletedPerS: float64(saturatingSub(readsComp, c.prevDiskReadsComp)) / duration,
|
||
DiskWritesCompletedPerS: float64(saturatingSub(writesComp, c.prevDiskWritesComp)) / duration,
|
||
}
|
||
|
||
c.storePrev(now,
|
||
cpuTotal, cpuIdle, cpuIowait, cpuSoftirq, ctxt, intr,
|
||
netIn, netOut, netErrs, netDrops, rxPackets, txPackets,
|
||
retrans, timeouts, lostRetrans, fastRetrans,
|
||
softDropped, softSqueeze,
|
||
diskRead, diskWrite, diskReadTime, diskWriteTime, diskIOTicks, readsComp, writesComp)
|
||
return snap
|
||
}
|
||
|
||
func (c *MetricCollector) storePrev(
|
||
now time.Time,
|
||
cpuTotal, cpuIdle, cpuIowait, cpuSoftirq, ctxt, intr,
|
||
netIn, netOut, netErrs, netDrops, rxPackets, txPackets,
|
||
retrans, timeouts, lostRetrans, fastRetrans,
|
||
softDropped, softSqueeze,
|
||
diskRead, diskWrite, diskReadTime, diskWriteTime, diskIOTicks, readsComp, writesComp uint64,
|
||
) {
|
||
c.prevTime = now
|
||
c.prevCPUTotal = cpuTotal
|
||
c.prevCPUIdle = cpuIdle
|
||
c.prevCPUIoWait = cpuIowait
|
||
c.prevCPUSoftIrq = cpuSoftirq
|
||
c.prevCtxt = ctxt
|
||
c.prevIntr = intr
|
||
c.prevNetIn = netIn
|
||
c.prevNetOut = netOut
|
||
c.prevNetErrs = netErrs
|
||
c.prevNetDrops = netDrops
|
||
c.prevTCPRetrans = retrans
|
||
c.prevTCPTimeouts = timeouts
|
||
c.prevTCPLostRetrans = lostRetrans
|
||
c.prevTCPFastRetrans = fastRetrans
|
||
c.prevSoftnetDropped = softDropped
|
||
c.prevSoftnetSqueeze = softSqueeze
|
||
c.prevDiskRead = diskRead
|
||
c.prevDiskWrite = diskWrite
|
||
c.prevDiskReadTimeMs = diskReadTime
|
||
c.prevDiskWriteTimeMs = diskWriteTime
|
||
c.prevDiskIOTicks = diskIOTicks
|
||
c.prevNetPacketsIn = rxPackets
|
||
c.prevNetPacketsOut = txPackets
|
||
c.prevDiskReadsComp = readsComp
|
||
c.prevDiskWritesComp = writesComp
|
||
}
|
||
|
||
// ── /proc readers ─────────────────────────────────────────────────────────────
|
||
|
||
// readSystemStats reads /proc/stat and returns cumulative CPU jiffies
|
||
// (total, idle, iowait, softirq) plus cumulative context-switches and
|
||
// interrupt counts.
|
||
//
|
||
// /proc/stat CPU column layout:
|
||
//
|
||
// col 1=user 2=nice 3=system 4=idle 5=iowait 6=irq 7=softirq
|
||
func (c *MetricCollector) readSystemStats() (total, idle, iowait, softirq, ctxt, intr uint64) {
|
||
f, err := os.Open("/proc/stat")
|
||
if err != nil {
|
||
log.Printf("metric: open /proc/stat: %v", err)
|
||
return
|
||
}
|
||
defer f.Close()
|
||
|
||
scanner := bufio.NewScanner(f)
|
||
for scanner.Scan() {
|
||
fields := strings.Fields(scanner.Text())
|
||
if len(fields) == 0 {
|
||
continue
|
||
}
|
||
switch fields[0] {
|
||
case "cpu":
|
||
for i := 1; i < len(fields); i++ {
|
||
v, _ := strconv.ParseUint(fields[i], 10, 64)
|
||
total += v
|
||
switch i {
|
||
case 4:
|
||
idle = v
|
||
case 5:
|
||
iowait = v
|
||
case 7:
|
||
softirq = v
|
||
}
|
||
}
|
||
case "ctxt":
|
||
if len(fields) > 1 {
|
||
ctxt, _ = strconv.ParseUint(fields[1], 10, 64)
|
||
}
|
||
case "intr":
|
||
if len(fields) > 1 {
|
||
intr, _ = strconv.ParseUint(fields[1], 10, 64)
|
||
}
|
||
}
|
||
}
|
||
if err := scanner.Err(); err != nil {
|
||
log.Printf("metric: scan /proc/stat: %v", err)
|
||
}
|
||
return
|
||
}
|
||
|
||
func (c *MetricCollector) readMemInfo() (used, cached, dirty uint64) {
|
||
f, err := os.Open("/proc/meminfo")
|
||
if err != nil {
|
||
log.Printf("metric: open /proc/meminfo: %v", err)
|
||
return
|
||
}
|
||
defer f.Close()
|
||
|
||
var total, available uint64
|
||
scanner := bufio.NewScanner(f)
|
||
for scanner.Scan() {
|
||
fields := strings.Fields(scanner.Text())
|
||
if len(fields) < 2 {
|
||
continue
|
||
}
|
||
val, _ := strconv.ParseUint(fields[1], 10, 64)
|
||
switch fields[0] {
|
||
case "MemTotal:":
|
||
total = val
|
||
case "MemAvailable:":
|
||
available = val
|
||
case "Cached:":
|
||
cached = val / 1024 // kB → MB
|
||
case "Dirty:":
|
||
dirty = val / 1024 // kB → MB
|
||
}
|
||
}
|
||
if err := scanner.Err(); err != nil {
|
||
log.Printf("metric: scan /proc/meminfo: %v", err)
|
||
}
|
||
if total >= available {
|
||
used = (total - available) / 1024
|
||
}
|
||
return
|
||
}
|
||
|
||
// readNetDev reads /proc/net/dev for the configured interface.
|
||
//
|
||
// /proc/net/dev column layout (after stripping "iface:"):
|
||
//
|
||
// 0=rx_bytes 1=rx_packets 2=rx_errs 3=rx_drop
|
||
// 4=rx_fifo 5=rx_frame 6=rx_compressed 7=rx_multicast
|
||
// 8=tx_bytes 9=tx_packets 10=tx_errs 11=tx_drop ...
|
||
// 8=tx_bytes 9=tx_packets 10=tx_errs 11=tx_drop ...
|
||
func (c *MetricCollector) readNetDev() (rxBytes, txBytes, errs, drops, rxPackets, txPackets uint64) {
|
||
f, err := os.Open("/proc/net/dev")
|
||
if err != nil {
|
||
return 0, 0, 0, 0, 0, 0
|
||
}
|
||
defer f.Close()
|
||
|
||
prefix := c.netInterface + ":"
|
||
scanner := bufio.NewScanner(f)
|
||
for scanner.Scan() {
|
||
line := strings.TrimSpace(scanner.Text())
|
||
if !strings.HasPrefix(line, prefix) {
|
||
continue
|
||
}
|
||
line = strings.TrimPrefix(line, prefix)
|
||
fields := strings.Fields(line)
|
||
if len(fields) < 12 {
|
||
log.Printf("metric: unexpected /proc/net/dev format for %q", c.netInterface)
|
||
return 0, 0, 0, 0, 0, 0
|
||
}
|
||
rxBytes, _ = strconv.ParseUint(fields[0], 10, 64)
|
||
rxPackets, _ = strconv.ParseUint(fields[1], 10, 64)
|
||
rxErrs, _ := strconv.ParseUint(fields[2], 10, 64)
|
||
rxDrops, _ := strconv.ParseUint(fields[3], 10, 64)
|
||
txBytes, _ = strconv.ParseUint(fields[8], 10, 64)
|
||
txPackets, _ = strconv.ParseUint(fields[9], 10, 64)
|
||
txErrs, _ := strconv.ParseUint(fields[10], 10, 64)
|
||
txDrops, _ := strconv.ParseUint(fields[11], 10, 64)
|
||
return rxBytes, txBytes, rxErrs + txErrs, rxDrops + txDrops, rxPackets, txPackets
|
||
}
|
||
if err := scanner.Err(); err != nil {
|
||
log.Printf("metric: scan /proc/net/dev: %v", err)
|
||
}
|
||
return 0, 0, 0, 0, 0, 0
|
||
}
|
||
|
||
// readSNMPStats reads RetransSegs from /proc/net/snmp (Tcp section).
|
||
//
|
||
// /proc/net/snmp Tcp header order (kernel-stable):
|
||
//
|
||
// RtoAlgorithm RtoMin RtoMax MaxConn ActiveOpens PassiveOpens
|
||
// AttemptFails EstabResets CurrEstab InSegs OutSegs RetransSegs InErrs OutRsts
|
||
//
|
||
// RetransSegs is at index 12 (0-based) in the value row.
|
||
func (c *MetricCollector) readSNMPStats() uint64 {
|
||
f, err := os.Open("/proc/net/snmp")
|
||
if err != nil {
|
||
return 0
|
||
}
|
||
defer f.Close()
|
||
|
||
// The file alternates header/value rows for each protocol block.
|
||
// We need both rows to find RetransSegs by column name.
|
||
scanner := bufio.NewScanner(f)
|
||
var tcpHeader []string
|
||
for scanner.Scan() {
|
||
line := scanner.Text()
|
||
if !strings.HasPrefix(line, "Tcp:") {
|
||
continue
|
||
}
|
||
fields := strings.Fields(line)
|
||
if tcpHeader == nil {
|
||
tcpHeader = fields // first Tcp: line is the header
|
||
continue
|
||
}
|
||
// second Tcp: line is the values
|
||
for i, h := range tcpHeader {
|
||
if h == "RetransSegs" && i < len(fields) {
|
||
v, _ := strconv.ParseUint(fields[i], 10, 64)
|
||
return v
|
||
}
|
||
}
|
||
}
|
||
if err := scanner.Err(); err != nil {
|
||
log.Printf("metric: scan /proc/net/snmp: %v", err)
|
||
}
|
||
return 0
|
||
}
|
||
|
||
// readNetstat reads TCPTimeouts, TCPLostRetransmit and TCPFastRetrans from
|
||
// /proc/net/netstat (TcpExt section). The file alternates header/value rows.
|
||
func (c *MetricCollector) readNetstat() (timeouts, lostRetrans, fastRetrans uint64) {
|
||
f, err := os.Open("/proc/net/netstat")
|
||
if err != nil {
|
||
return 0, 0, 0
|
||
}
|
||
defer f.Close()
|
||
|
||
scanner := bufio.NewScanner(f)
|
||
var headers []string
|
||
for scanner.Scan() {
|
||
line := scanner.Text()
|
||
if !strings.HasPrefix(line, "TcpExt:") {
|
||
continue
|
||
}
|
||
fields := strings.Fields(line)
|
||
if headers == nil {
|
||
headers = fields
|
||
continue
|
||
}
|
||
// value row
|
||
for i, h := range headers {
|
||
if i >= len(fields) {
|
||
break
|
||
}
|
||
switch h {
|
||
case "TCPTimeouts":
|
||
timeouts, _ = strconv.ParseUint(fields[i], 10, 64)
|
||
case "TCPLostRetransmit":
|
||
lostRetrans, _ = strconv.ParseUint(fields[i], 10, 64)
|
||
case "TCPFastRetrans":
|
||
fastRetrans, _ = strconv.ParseUint(fields[i], 10, 64)
|
||
}
|
||
}
|
||
}
|
||
if err := scanner.Err(); err != nil {
|
||
log.Printf("metric: scan /proc/net/netstat: %v", err)
|
||
}
|
||
return
|
||
}
|
||
|
||
// readSoftnetStat reads /proc/net/softnet_stat and sums dropped and
|
||
// time_squeeze across all CPU columns (hex values).
|
||
func (c *MetricCollector) readSoftnetStat() (dropped, timeSqueeze uint64) {
|
||
f, err := os.Open("/proc/net/softnet_stat")
|
||
if err != nil {
|
||
return 0, 0
|
||
}
|
||
defer f.Close()
|
||
|
||
scanner := bufio.NewScanner(f)
|
||
for scanner.Scan() {
|
||
fields := strings.Fields(scanner.Text())
|
||
// col 0 = total, col 1 = dropped, col 2 = time_squeeze
|
||
if len(fields) >= 3 {
|
||
d, _ := strconv.ParseUint(fields[1], 16, 64)
|
||
t, _ := strconv.ParseUint(fields[2], 16, 64)
|
||
dropped += d
|
||
timeSqueeze += t
|
||
}
|
||
}
|
||
if err := scanner.Err(); err != nil {
|
||
log.Printf("metric: scan /proc/net/softnet_stat: %v", err)
|
||
}
|
||
return
|
||
}
|
||
|
||
// readDiskStats reads /proc/diskstats for the configured device.
|
||
//
|
||
// /proc/diskstats column layout (kernel ≥ 4.18):
|
||
//
|
||
// 0=major 1=minor 2=name
|
||
// 3=reads_completed 4=reads_merged 5=sectors_read 6=read_time_ms
|
||
// 7=writes_completed 8=writes_merged 9=sectors_written 10=write_time_ms
|
||
// 11=io_in_progress 12=io_ticks_ms 13=weighted_io_ticks
|
||
// 11=io_in_progress 12=io_ticks_ms 13=weighted_io_ticks
|
||
func (c *MetricCollector) readDiskStats() (readBytes, writeBytes, readTimeMs, writeTimeMs, ioTicks, readsComp, writesComp uint64) {
|
||
f, err := os.Open("/proc/diskstats")
|
||
if err != nil {
|
||
log.Printf("metric: open /proc/diskstats: %v", err)
|
||
return
|
||
}
|
||
defer f.Close()
|
||
|
||
scanner := bufio.NewScanner(f)
|
||
for scanner.Scan() {
|
||
fields := strings.Fields(scanner.Text())
|
||
if len(fields) < 14 || fields[2] != c.diskDevice {
|
||
continue
|
||
}
|
||
readsComp, _ = strconv.ParseUint(fields[3], 10, 64)
|
||
writesComp, _ = strconv.ParseUint(fields[7], 10, 64)
|
||
rSectors, _ := strconv.ParseUint(fields[5], 10, 64)
|
||
wSectors, _ := strconv.ParseUint(fields[9], 10, 64)
|
||
rTime, _ := strconv.ParseUint(fields[6], 10, 64)
|
||
wTime, _ := strconv.ParseUint(fields[10], 10, 64)
|
||
ticks, _ := strconv.ParseUint(fields[12], 10, 64)
|
||
return rSectors * 512, wSectors * 512, rTime, wTime, ticks, readsComp, writesComp
|
||
}
|
||
if err := scanner.Err(); err != nil {
|
||
log.Printf("metric: scan /proc/diskstats: %v", err)
|
||
}
|
||
return
|
||
}
|
||
|
||
// ── health ────────────────────────────────────────────────────────────────────
|
||
|
||
func (c *MetricCollector) emitHealth() {
|
||
p := c.processed.Load()
|
||
d := c.dropped.Load()
|
||
select {
|
||
case c.healthChan <- types.StageHealth{
|
||
StageName: "metric_collector",
|
||
EventsProcessed: p,
|
||
EventsDropped: d,
|
||
Throughput: float64(p) / 5.0,
|
||
LastUpdate: time.Now(),
|
||
}:
|
||
default:
|
||
}
|
||
}
|
||
|
||
// ── helpers ───────────────────────────────────────────────────────────────────
|
||
|
||
// saturatingSub returns a − b, or 0 when b is larger than a.
// It is used for counter deltas so that a counter which moved backwards
// produces a zero rate instead of a wrapped-around unsigned value.
func saturatingSub(a, b uint64) uint64 {
	if b > a {
		return 0
	}
	return a - b
}
|