guenther/internal/collector/metric.go

542 lines
17 KiB
Go
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

package collector
import (
"bufio"
"context"
"log"
"os"
"strconv"
"strings"
"sync"
"sync/atomic"
"time"
"codeberg.org/pata1704/guenther/pkg/types"
)
// MetricCollector periodically reads Linux system counters from /proc and
// turns each pair of consecutive readings into a types.MetricSnapshot.
//
// Start launches a single goroutine that performs every /proc read and owns
// all of the prev* delta fields, which is why those fields carry no lock.
// Snapshots are handed to outputChan with a non-blocking send: when the
// channel cannot accept a value the snapshot is discarded and the dropped
// counter is incremented instead of stalling the sampler.
type MetricCollector struct {
	// Sinks for snapshots and periodic health reports.
	outputChan chan<- types.MetricSnapshot
	healthChan chan<- types.StageHealth

	// Sampling configuration, set once by NewMetricCollector.
	interval     time.Duration
	netInterface string
	diskDevice   string

	// wg tracks the collector goroutine so Wait can block until it exits.
	wg sync.WaitGroup

	// Delta state: the previous sample's timestamp and cumulative counters.
	// Only the collector goroutine reads or writes these.
	prevTime    time.Time
	firstSample bool // true until the first sample has primed the prev* fields

	// /proc/stat
	prevCPUTotal, prevCPUIdle, prevCPUIoWait, prevCPUSoftIrq uint64
	prevCtxt, prevIntr                                       uint64

	// /proc/net/dev
	prevNetIn, prevNetOut               uint64
	prevNetErrs, prevNetDrops           uint64
	prevNetPacketsIn, prevNetPacketsOut uint64

	// /proc/net/snmp and /proc/net/netstat
	prevTCPRetrans, prevTCPTimeouts        uint64
	prevTCPLostRetrans, prevTCPFastRetrans uint64

	// /proc/net/softnet_stat
	prevSoftnetDropped, prevSoftnetSqueeze uint64

	// /proc/diskstats
	prevDiskRead, prevDiskWrite             uint64
	prevDiskReadTimeMs, prevDiskWriteTimeMs uint64
	prevDiskIOTicks                         uint64
	prevDiskReadsComp, prevDiskWritesComp   uint64

	// Cumulative counts of snapshots delivered to / shed from outputChan.
	// Atomic because they are updated by the collector goroutine and read
	// when health reports are built.
	processed atomic.Uint64
	dropped   atomic.Uint64
}
// NewMetricCollector builds a collector that samples every interval and
// reports on the network interface netIntf and the block device diskDev.
//
// Snapshots are written to output and health reports to health. The returned
// collector is idle until Start is called; its first sample only primes the
// delta state and produces no snapshot.
func NewMetricCollector(
	output chan<- types.MetricSnapshot,
	health chan<- types.StageHealth,
	interval time.Duration,
	netIntf, diskDev string,
) *MetricCollector {
	mc := new(MetricCollector)
	mc.outputChan, mc.healthChan = output, health
	mc.interval = interval
	mc.netInterface, mc.diskDevice = netIntf, diskDev
	mc.firstSample = true
	return mc
}
// Start launches the collector goroutine and returns immediately.
//
// The goroutine takes a sample on every tick of the configured interval and
// emits a health report every 5 seconds. A sample that yields a snapshot is
// offered to the output channel without blocking: if the channel is not ready
// the snapshot is discarded and counted as dropped. The goroutine stops both
// tickers and exits once ctx is done; use Wait to block until it has.
func (c *MetricCollector) Start(ctx context.Context) {
	sampleTick := time.NewTicker(c.interval)
	healthTick := time.NewTicker(5 * time.Second)

	// Baseline timestamp for the duration of the very first sample.
	c.prevTime = time.Now()

	c.wg.Go(func() {
		defer sampleTick.Stop()
		defer healthTick.Stop()

		for {
			select {
			case <-ctx.Done():
				return

			case <-healthTick.C:
				c.emitHealth()

			case <-sampleTick.C:
				// collect returns nil for the priming sample.
				if snap := c.collect(); snap != nil {
					select {
					case c.outputChan <- *snap:
						c.processed.Add(1)
					default:
						// Consumer is not keeping up: shed the sample.
						c.dropped.Add(1)
					}
				}
			}
		}
	})
}
// Wait blocks until the goroutine launched by Start has returned.
//
// That goroutine only exits when the context passed to Start is done, so the
// caller is expected to cancel that context before (or while) calling Wait.
func (c *MetricCollector) Wait() {
c.wg.Wait()
}
// ── collection ────────────────────────────────────────────────────────────────
// collect reads every /proc source once and converts the cumulative counters
// into a snapshot of per-second rates relative to the previous sample.
//
// The first call has nothing to diff against: it only stores the readings as
// the baseline, clears firstSample and returns nil. Every later call returns a
// non-nil snapshot and then replaces the baseline with the current readings.
//
// Memory figures are absolute values (MB); CPU figures are percentages of the
// jiffies elapsed between the two samples; everything else is a delta divided
// by the elapsed wall-clock seconds. All deltas saturate at zero so that a
// counter moving backwards never produces a negative or wrapped value.
func (c *MetricCollector) collect() *types.MetricSnapshot {
	now := time.Now()
	duration := now.Sub(c.prevTime).Seconds()

	cpuTotal, cpuIdle, cpuIowait, cpuSoftirq, ctxt, intr := c.readSystemStats()
	memUsed, memCached, memDirty := c.readMemInfo()
	netIn, netOut, netErrs, netDrops, rxPackets, txPackets := c.readNetDev()
	retrans := c.readSNMPStats()
	timeouts, lostRetrans, fastRetrans := c.readNetstat()
	softDropped, softSqueeze := c.readSoftnetStat()
	diskRead, diskWrite, diskReadTime, diskWriteTime, diskIOTicks, readsComp, writesComp := c.readDiskStats()

	if c.firstSample {
		c.storePrev(now,
			cpuTotal, cpuIdle, cpuIowait, cpuSoftirq, ctxt, intr,
			netIn, netOut, netErrs, netDrops, rxPackets, txPackets,
			retrans, timeouts, lostRetrans, fastRetrans,
			softDropped, softSqueeze,
			diskRead, diskWrite, diskReadTime, diskWriteTime, diskIOTicks, readsComp, writesComp)
		c.firstSample = false
		return nil
	}

	// Guard the divisions below against a zero or negative elapsed time.
	if duration < 1e-6 {
		duration = 1e-6
	}

	// perSec turns a cumulative counter pair into an events-per-second rate.
	perSec := func(cur, prev uint64) float64 {
		return float64(saturatingSub(cur, prev)) / duration
	}
	// mbPerSec is perSec for byte counters, scaled to MiB.
	mbPerSec := func(cur, prev uint64) float64 {
		return perSec(cur, prev) / 1_048_576
	}

	cpuDelta := saturatingSub(cpuTotal, c.prevCPUTotal)
	cpuIdleDelta := saturatingSub(cpuIdle, c.prevCPUIdle)
	cpuPercent, cpuIowaitPercent, cpuSoftirqPercent := 0.0, 0.0, 0.0
	if cpuDelta > 0 {
		// The idle delta can exceed the total delta when another column
		// (iowait in particular) moves backwards between samples. A plain
		// uint64 subtraction would wrap and report an absurd percentage,
		// so the busy time saturates at zero instead.
		busy := saturatingSub(cpuDelta, cpuIdleDelta)
		cpuPercent = float64(busy) / float64(cpuDelta) * 100.0
		cpuIowaitPercent = float64(saturatingSub(cpuIowait, c.prevCPUIoWait)) / float64(cpuDelta) * 100.0
		cpuSoftirqPercent = float64(saturatingSub(cpuSoftirq, c.prevCPUSoftIrq)) / float64(cpuDelta) * 100.0
	}

	snap := &types.MetricSnapshot{
		Timestamp:               now,
		CPUPercent:              cpuPercent,
		CPUIoWaitPercent:        cpuIowaitPercent,
		CPUSoftIrqPercent:       cpuSoftirqPercent,
		ContextSwitchesPerS:     perSec(ctxt, c.prevCtxt),
		InterruptsPerS:          perSec(intr, c.prevIntr),
		MemoryUsedMB:            float64(memUsed),
		MemoryCachedMB:          float64(memCached),
		MemoryDirtyMB:           float64(memDirty),
		NetworkInMBps:           mbPerSec(netIn, c.prevNetIn),
		NetworkOutMBps:          mbPerSec(netOut, c.prevNetOut),
		NetErrorsPerS:           perSec(netErrs, c.prevNetErrs),
		NetDropsPerS:            perSec(netDrops, c.prevNetDrops),
		TCPRetransPerS:          perSec(retrans, c.prevTCPRetrans),
		TCPTimeoutsPerS:         perSec(timeouts, c.prevTCPTimeouts),
		TCPLostRetransmitPerS:   perSec(lostRetrans, c.prevTCPLostRetrans),
		TCPFastRetransPerS:      perSec(fastRetrans, c.prevTCPFastRetrans),
		SoftnetDroppedPerS:      perSec(softDropped, c.prevSoftnetDropped),
		SoftnetTimeSqueezePerS:  perSec(softSqueeze, c.prevSoftnetSqueeze),
		DiskReadMBps:            mbPerSec(diskRead, c.prevDiskRead),
		DiskWriteMBps:           mbPerSec(diskWrite, c.prevDiskWrite),
		DiskReadTimeMsPerS:      perSec(diskReadTime, c.prevDiskReadTimeMs),
		DiskWriteTimeMsPerS:     perSec(diskWriteTime, c.prevDiskWriteTimeMs),
		DiskIOTicksPerS:         perSec(diskIOTicks, c.prevDiskIOTicks),
		NetPacketsInPerS:        perSec(rxPackets, c.prevNetPacketsIn),
		NetPacketsOutPerS:       perSec(txPackets, c.prevNetPacketsOut),
		DiskReadsCompletedPerS:  perSec(readsComp, c.prevDiskReadsComp),
		DiskWritesCompletedPerS: perSec(writesComp, c.prevDiskWritesComp),
	}

	c.storePrev(now,
		cpuTotal, cpuIdle, cpuIowait, cpuSoftirq, ctxt, intr,
		netIn, netOut, netErrs, netDrops, rxPackets, txPackets,
		retrans, timeouts, lostRetrans, fastRetrans,
		softDropped, softSqueeze,
		diskRead, diskWrite, diskReadTime, diskWriteTime, diskIOTicks, readsComp, writesComp)
	return snap
}
// storePrev records the current sample's timestamp and cumulative counters as
// the baseline that the next collect call will diff against.
//
// It is a plain field copy: every argument is written to its matching prev*
// field and nothing else is touched.
func (c *MetricCollector) storePrev(
	now time.Time,
	cpuTotal, cpuIdle, cpuIowait, cpuSoftirq, ctxt, intr,
	netIn, netOut, netErrs, netDrops, rxPackets, txPackets,
	retrans, timeouts, lostRetrans, fastRetrans,
	softDropped, softSqueeze,
	diskRead, diskWrite, diskReadTime, diskWriteTime, diskIOTicks, readsComp, writesComp uint64,
) {
	c.prevTime = now

	// /proc/stat
	c.prevCPUTotal, c.prevCPUIdle = cpuTotal, cpuIdle
	c.prevCPUIoWait, c.prevCPUSoftIrq = cpuIowait, cpuSoftirq
	c.prevCtxt, c.prevIntr = ctxt, intr

	// /proc/net/dev
	c.prevNetIn, c.prevNetOut = netIn, netOut
	c.prevNetErrs, c.prevNetDrops = netErrs, netDrops
	c.prevNetPacketsIn, c.prevNetPacketsOut = rxPackets, txPackets

	// /proc/net/snmp and /proc/net/netstat
	c.prevTCPRetrans, c.prevTCPTimeouts = retrans, timeouts
	c.prevTCPLostRetrans, c.prevTCPFastRetrans = lostRetrans, fastRetrans

	// /proc/net/softnet_stat
	c.prevSoftnetDropped, c.prevSoftnetSqueeze = softDropped, softSqueeze

	// /proc/diskstats
	c.prevDiskRead, c.prevDiskWrite = diskRead, diskWrite
	c.prevDiskReadTimeMs, c.prevDiskWriteTimeMs = diskReadTime, diskWriteTime
	c.prevDiskIOTicks = diskIOTicks
	c.prevDiskReadsComp, c.prevDiskWritesComp = readsComp, writesComp
}
// ── /proc readers ─────────────────────────────────────────────────────────────
// readSystemStats reads /proc/stat and returns cumulative CPU jiffies
// (total, idle, iowait, softirq) plus the cumulative context-switch and
// interrupt counts.
//
// /proc/stat aggregate "cpu" line column layout:
//
//	col 1=user 2=nice 3=system 4=idle 5=iowait 6=irq 7=softirq 8=steal
//	col 9=guest 10=guest_nice
//
// total is the sum of columns 1-8 only. The kernel already accounts guest
// time inside user and guest_nice inside nice, so adding columns 9 and 10
// would count that time twice and distort every percentage derived from
// total on hosts that run virtual machines.
//
// Only the aggregate "cpu" line is used; per-core "cpuN" lines are ignored.
// On an open failure the error is logged and all results are zero. Fields
// that fail to parse contribute whatever strconv.ParseUint returns alongside
// the error (0 for malformed input).
func (c *MetricCollector) readSystemStats() (total, idle, iowait, softirq, ctxt, intr uint64) {
	f, err := os.Open("/proc/stat")
	if err != nil {
		log.Printf("metric: open /proc/stat: %v", err)
		return
	}
	defer f.Close()

	// Last column that belongs in the total (steal); see the doc comment.
	const lastTotalCol = 8

	scanner := bufio.NewScanner(f)
	for scanner.Scan() {
		fields := strings.Fields(scanner.Text())
		if len(fields) == 0 {
			continue
		}
		switch fields[0] {
		case "cpu":
			for i := 1; i < len(fields) && i <= lastTotalCol; i++ {
				v, _ := strconv.ParseUint(fields[i], 10, 64)
				total += v
				switch i {
				case 4:
					idle = v
				case 5:
					iowait = v
				case 7:
					softirq = v
				}
			}
		case "ctxt":
			if len(fields) > 1 {
				ctxt, _ = strconv.ParseUint(fields[1], 10, 64)
			}
		case "intr":
			// The first value on the intr line is the grand total.
			if len(fields) > 1 {
				intr, _ = strconv.ParseUint(fields[1], 10, 64)
			}
		}
	}
	if err := scanner.Err(); err != nil {
		log.Printf("metric: scan /proc/stat: %v", err)
	}
	return
}
// readMemInfo reads /proc/meminfo and returns used, page-cache and dirty
// memory, each converted from the file's kB values to MB by integer division.
//
// used is MemTotal minus MemAvailable; it stays zero if MemAvailable is
// larger than MemTotal. On an open failure the error is logged and all
// results are zero. A scan error is logged and whatever was parsed up to that
// point is still returned.
func (c *MetricCollector) readMemInfo() (used, cached, dirty uint64) {
	file, err := os.Open("/proc/meminfo")
	if err != nil {
		log.Printf("metric: open /proc/meminfo: %v", err)
		return 0, 0, 0
	}
	defer file.Close()

	// Raw kB values, keyed by the label exactly as it appears in the file.
	var totalKB, availKB, cachedKB, dirtyKB uint64
	wanted := map[string]*uint64{
		"MemTotal:":     &totalKB,
		"MemAvailable:": &availKB,
		"Cached:":       &cachedKB,
		"Dirty:":        &dirtyKB,
	}

	sc := bufio.NewScanner(file)
	for sc.Scan() {
		cols := strings.Fields(sc.Text())
		if len(cols) < 2 {
			continue
		}
		dst, ok := wanted[cols[0]]
		if !ok {
			continue
		}
		*dst, _ = strconv.ParseUint(cols[1], 10, 64)
	}
	if scanErr := sc.Err(); scanErr != nil {
		log.Printf("metric: scan /proc/meminfo: %v", scanErr)
	}

	// kB → MB
	cached = cachedKB / 1024
	dirty = dirtyKB / 1024
	if availKB <= totalKB {
		used = (totalKB - availKB) / 1024
	}
	return used, cached, dirty
}
// readNetDev reads /proc/net/dev and returns the cumulative counters of the
// configured interface: received and transmitted bytes, combined rx+tx
// errors, combined rx+tx drops, and received and transmitted packets.
//
// /proc/net/dev column layout (after stripping "iface:"):
//
//	0=rx_bytes 1=rx_packets 2=rx_errs  3=rx_drop
//	4=rx_fifo  5=rx_frame   6=rx_compressed 7=rx_multicast
//	8=tx_bytes 9=tx_packets 10=tx_errs 11=tx_drop ...
//
// All results are zero when the file cannot be opened, when the interface's
// line has fewer than 12 columns, or when no line for the interface exists.
// The first two cases are logged; a missing interface is not.
func (c *MetricCollector) readNetDev() (rxBytes, txBytes, errs, drops, rxPackets, txPackets uint64) {
	f, err := os.Open("/proc/net/dev")
	if err != nil {
		// Logged like the other /proc readers so a missing or unreadable
		// file does not silently turn into all-zero network metrics.
		log.Printf("metric: open /proc/net/dev: %v", err)
		return 0, 0, 0, 0, 0, 0
	}
	defer f.Close()

	// Matching on "name:" keeps e.g. "eth0" from matching "eth01".
	prefix := c.netInterface + ":"
	scanner := bufio.NewScanner(f)
	for scanner.Scan() {
		line := strings.TrimSpace(scanner.Text())
		if !strings.HasPrefix(line, prefix) {
			continue
		}
		line = strings.TrimPrefix(line, prefix)
		fields := strings.Fields(line)
		if len(fields) < 12 {
			log.Printf("metric: unexpected /proc/net/dev format for %q", c.netInterface)
			return 0, 0, 0, 0, 0, 0
		}
		rxBytes, _ = strconv.ParseUint(fields[0], 10, 64)
		rxPackets, _ = strconv.ParseUint(fields[1], 10, 64)
		rxErrs, _ := strconv.ParseUint(fields[2], 10, 64)
		rxDrops, _ := strconv.ParseUint(fields[3], 10, 64)
		txBytes, _ = strconv.ParseUint(fields[8], 10, 64)
		txPackets, _ = strconv.ParseUint(fields[9], 10, 64)
		txErrs, _ := strconv.ParseUint(fields[10], 10, 64)
		txDrops, _ := strconv.ParseUint(fields[11], 10, 64)
		return rxBytes, txBytes, rxErrs + txErrs, rxDrops + txDrops, rxPackets, txPackets
	}
	if err := scanner.Err(); err != nil {
		log.Printf("metric: scan /proc/net/dev: %v", err)
	}
	return 0, 0, 0, 0, 0, 0
}
// readSNMPStats returns the cumulative RetransSegs counter from the Tcp
// section of /proc/net/snmp.
//
// Each protocol section of the file is a header row followed by a value row,
// both starting with the protocol label (here "Tcp:"). The header row is kept
// and RetransSegs is located in it by name, so the result does not depend on
// the column's position; the value at the same position in the value row is
// returned.
//
// The result is zero when the file cannot be opened (logged), when scanning
// fails (logged), or when no RetransSegs column is found.
func (c *MetricCollector) readSNMPStats() uint64 {
	f, err := os.Open("/proc/net/snmp")
	if err != nil {
		// Logged like the other /proc readers instead of silently
		// reporting a zero retransmission count.
		log.Printf("metric: open /proc/net/snmp: %v", err)
		return 0
	}
	defer f.Close()

	scanner := bufio.NewScanner(f)
	var tcpHeader []string
	for scanner.Scan() {
		line := scanner.Text()
		// The trailing colon keeps "TcpExt:"-style labels from matching.
		if !strings.HasPrefix(line, "Tcp:") {
			continue
		}
		fields := strings.Fields(line)
		if tcpHeader == nil {
			tcpHeader = fields // first Tcp: line is the header
			continue
		}
		// second Tcp: line is the values
		for i, h := range tcpHeader {
			if h == "RetransSegs" && i < len(fields) {
				v, _ := strconv.ParseUint(fields[i], 10, 64)
				return v
			}
		}
	}
	if err := scanner.Err(); err != nil {
		log.Printf("metric: scan /proc/net/snmp: %v", err)
	}
	return 0
}
// readNetstat returns the cumulative TCPTimeouts, TCPLostRetransmit and
// TCPFastRetrans counters from the TcpExt section of /proc/net/netstat.
//
// The section is a header row followed by a value row, both starting with
// "TcpExt:". Columns are matched by header name, and a counter whose column is
// absent stays zero. Reading stops as soon as the value row has been
// processed.
//
// All results are zero when the file cannot be opened (logged) or when no
// TcpExt value row is found; a scan error is logged as well.
func (c *MetricCollector) readNetstat() (timeouts, lostRetrans, fastRetrans uint64) {
	f, err := os.Open("/proc/net/netstat")
	if err != nil {
		// Logged like the other /proc readers instead of silently
		// reporting zero for every TCP extension counter.
		log.Printf("metric: open /proc/net/netstat: %v", err)
		return 0, 0, 0
	}
	defer f.Close()

	scanner := bufio.NewScanner(f)
	var headers []string
	for scanner.Scan() {
		line := scanner.Text()
		if !strings.HasPrefix(line, "TcpExt:") {
			continue
		}
		fields := strings.Fields(line)
		if headers == nil {
			headers = fields
			continue
		}
		// value row
		for i, h := range headers {
			if i >= len(fields) {
				break
			}
			switch h {
			case "TCPTimeouts":
				timeouts, _ = strconv.ParseUint(fields[i], 10, 64)
			case "TCPLostRetransmit":
				lostRetrans, _ = strconv.ParseUint(fields[i], 10, 64)
			case "TCPFastRetrans":
				fastRetrans, _ = strconv.ParseUint(fields[i], 10, 64)
			}
		}
		// Everything needed has been read; skip the rest of the file.
		return timeouts, lostRetrans, fastRetrans
	}
	if err := scanner.Err(); err != nil {
		log.Printf("metric: scan /proc/net/netstat: %v", err)
	}
	return timeouts, lostRetrans, fastRetrans
}
// readSoftnetStat reads /proc/net/softnet_stat and returns the dropped and
// time_squeeze counters summed over every row of the file (one row per CPU).
//
// Values in this file are hexadecimal. Per row: column 0 is the processed
// total, column 1 is dropped and column 2 is time_squeeze. Rows with fewer
// than three columns are skipped.
//
// Both results are zero when the file cannot be opened (logged). A scan error
// is logged and the sums accumulated so far are returned.
func (c *MetricCollector) readSoftnetStat() (dropped, timeSqueeze uint64) {
	f, err := os.Open("/proc/net/softnet_stat")
	if err != nil {
		// Logged like the other /proc readers instead of silently
		// reporting zero softnet drops.
		log.Printf("metric: open /proc/net/softnet_stat: %v", err)
		return 0, 0
	}
	defer f.Close()

	scanner := bufio.NewScanner(f)
	for scanner.Scan() {
		fields := strings.Fields(scanner.Text())
		// col 0 = total, col 1 = dropped, col 2 = time_squeeze
		if len(fields) >= 3 {
			d, _ := strconv.ParseUint(fields[1], 16, 64)
			t, _ := strconv.ParseUint(fields[2], 16, 64)
			dropped += d
			timeSqueeze += t
		}
	}
	if err := scanner.Err(); err != nil {
		log.Printf("metric: scan /proc/net/softnet_stat: %v", err)
	}
	return dropped, timeSqueeze
}
// readDiskStats reads /proc/diskstats and returns the cumulative counters of
// the configured block device: bytes read and written (sector counts times
// 512), time spent reading and writing in ms, io_ticks in ms, and the number
// of completed reads and writes.
//
// /proc/diskstats column layout:
//
//	0=major 1=minor 2=name
//	3=reads_completed  4=reads_merged  5=sectors_read    6=read_time_ms
//	7=writes_completed 8=writes_merged 9=sectors_written 10=write_time_ms
//	11=io_in_progress  12=io_ticks_ms  13=weighted_io_ticks
//
// Lines with fewer than 14 columns or a different device name are skipped.
// All results are zero when the file cannot be opened (logged) or when the
// device is not listed; a scan error is logged as well.
func (c *MetricCollector) readDiskStats() (readBytes, writeBytes, readTimeMs, writeTimeMs, ioTicks, readsComp, writesComp uint64) {
	file, err := os.Open("/proc/diskstats")
	if err != nil {
		log.Printf("metric: open /proc/diskstats: %v", err)
		return 0, 0, 0, 0, 0, 0, 0
	}
	defer file.Close()

	// Sector counts in this file are multiplied by 512 to obtain bytes.
	const sectorBytes = 512

	// col parses one decimal column; the parse error is deliberately ignored,
	// leaving whatever value strconv.ParseUint returns with it.
	col := func(cols []string, idx int) uint64 {
		v, _ := strconv.ParseUint(cols[idx], 10, 64)
		return v
	}

	sc := bufio.NewScanner(file)
	for sc.Scan() {
		cols := strings.Fields(sc.Text())
		if len(cols) >= 14 && cols[2] == c.diskDevice {
			readsComp = col(cols, 3)
			writesComp = col(cols, 7)
			readBytes = col(cols, 5) * sectorBytes
			writeBytes = col(cols, 9) * sectorBytes
			readTimeMs = col(cols, 6)
			writeTimeMs = col(cols, 10)
			ioTicks = col(cols, 12)
			return readBytes, writeBytes, readTimeMs, writeTimeMs, ioTicks, readsComp, writesComp
		}
	}
	if scanErr := sc.Err(); scanErr != nil {
		log.Printf("metric: scan /proc/diskstats: %v", scanErr)
	}
	return 0, 0, 0, 0, 0, 0, 0
}
// ── health ────────────────────────────────────────────────────────────────────
// emitHealth publishes a StageHealth report for this stage on healthChan.
//
// The report carries the cumulative processed and dropped snapshot counts and
// the current time. The send is non-blocking: if the health channel cannot
// accept the report right now, the report is silently discarded.
//
// NOTE(review): Throughput is the cumulative processed count divided by 5.0,
// presumably the 5 s report interval used in Start. That makes it grow with
// uptime rather than reflect the most recent interval; confirm what consumers
// of StageHealth expect before relying on it as a rate.
func (c *MetricCollector) emitHealth() {
	processed, dropped := c.processed.Load(), c.dropped.Load()

	report := types.StageHealth{
		StageName:       "metric_collector",
		EventsProcessed: processed,
		EventsDropped:   dropped,
		Throughput:      float64(processed) / 5.0,
		LastUpdate:      time.Now(),
	}

	select {
	case c.healthChan <- report:
	default:
		// Nobody is ready to receive; skip this report.
	}
}
// ── helpers ───────────────────────────────────────────────────────────────────
// saturatingSub returns a - b, or 0 when b is larger than a.
//
// Cumulative counters are expected to only grow, but a counter that resets or
// moves backwards would otherwise wrap the unsigned subtraction into a huge
// value; clamping at zero keeps derived rates non-negative and sane.
func saturatingSub(a, b uint64) uint64 {
	if a < b {
		return 0
	}
	return a - b
}