package collector import ( "bufio" "context" "log" "os" "strconv" "strings" "sync" "sync/atomic" "time" "codeberg.org/pata1704/guenther/pkg/types" ) // MetricCollector samples Linux system metrics from /proc at a fixed interval // and emits a types.MetricSnapshot for each sample. // // All /proc reads happen in the single collector goroutine, so no locking is // required for the delta-state fields. The output channel uses a non-blocking // send; overflows are counted in the dropped counter via load-shedding. type MetricCollector struct { outputChan chan<- types.MetricSnapshot healthChan chan<- types.StageHealth interval time.Duration netInterface string diskDevice string wg sync.WaitGroup // Delta state – only accessed from the single collector goroutine. prevSoftnetDropped uint64 prevSoftnetSqueeze uint64 prevNetPacketsIn uint64 prevNetPacketsOut uint64 prevDiskReadsComp uint64 prevDiskWritesComp uint64 prevDiskRead uint64 prevDiskWrite uint64 prevDiskReadTimeMs uint64 prevDiskWriteTimeMs uint64 prevDiskIOTicks uint64 prevCPUTotal uint64 prevCPUIdle uint64 prevCPUIoWait uint64 prevCPUSoftIrq uint64 prevCtxt uint64 prevIntr uint64 prevNetIn uint64 prevNetOut uint64 prevNetErrs uint64 prevNetDrops uint64 prevTCPRetrans uint64 prevTCPTimeouts uint64 prevTCPLostRetrans uint64 prevTCPFastRetrans uint64 prevTime time.Time firstSample bool processed atomic.Uint64 dropped atomic.Uint64 } func NewMetricCollector( output chan<- types.MetricSnapshot, health chan<- types.StageHealth, interval time.Duration, netIntf, diskDev string, ) *MetricCollector { return &MetricCollector{ outputChan: output, healthChan: health, interval: interval, netInterface: netIntf, diskDevice: diskDev, firstSample: true, } } func (c *MetricCollector) Start(ctx context.Context) { ticker := time.NewTicker(c.interval) reportTicker := time.NewTicker(5 * time.Second) c.prevTime = time.Now() c.wg.Go(func() { defer ticker.Stop() defer reportTicker.Stop() for { select { case <-ticker.C: snap := c.collect() if snap == nil { continue } select { case c.outputChan <- *snap: c.processed.Add(1) default: c.dropped.Add(1) } case <-reportTicker.C: c.emitHealth() case <-ctx.Done(): return } } }) } // Wait waits for the collector goroutine to exit after context cancellation. func (c *MetricCollector) Wait() { c.wg.Wait() } // ── collection ──────────────────────────────────────────────────────────────── func (c *MetricCollector) collect() *types.MetricSnapshot { now := time.Now() duration := now.Sub(c.prevTime).Seconds() cpuTotal, cpuIdle, cpuIowait, cpuSoftirq, ctxt, intr := c.readSystemStats() memUsed, memCached, memDirty := c.readMemInfo() netIn, netOut, netErrs, netDrops, rxPackets, txPackets := c.readNetDev() retrans := c.readSNMPStats() timeouts, lostRetrans, fastRetrans := c.readNetstat() softDropped, softSqueeze := c.readSoftnetStat() diskRead, diskWrite, diskReadTime, diskWriteTime, diskIOTicks, readsComp, writesComp := c.readDiskStats() if c.firstSample { c.storePrev(now, cpuTotal, cpuIdle, cpuIowait, cpuSoftirq, ctxt, intr, netIn, netOut, netErrs, netDrops, rxPackets, txPackets, retrans, timeouts, lostRetrans, fastRetrans, softDropped, softSqueeze, diskRead, diskWrite, diskReadTime, diskWriteTime, diskIOTicks, readsComp, writesComp) c.firstSample = false return nil } if duration < 1e-6 { duration = 1e-6 } cpuDelta := saturatingSub(cpuTotal, c.prevCPUTotal) cpuIdleDelta := saturatingSub(cpuIdle, c.prevCPUIdle) cpuPercent, cpuIowaitPercent, cpuSoftirqPercent := 0.0, 0.0, 0.0 if cpuDelta > 0 { cpuPercent = float64(cpuDelta-cpuIdleDelta) / float64(cpuDelta) * 100.0 cpuIowaitPercent = float64(saturatingSub(cpuIowait, c.prevCPUIoWait)) / float64(cpuDelta) * 100.0 cpuSoftirqPercent = float64(saturatingSub(cpuSoftirq, c.prevCPUSoftIrq)) / float64(cpuDelta) * 100.0 } snap := &types.MetricSnapshot{ Timestamp: now, CPUPercent: cpuPercent, CPUIoWaitPercent: cpuIowaitPercent, CPUSoftIrqPercent: cpuSoftirqPercent, ContextSwitchesPerS: float64(saturatingSub(ctxt, c.prevCtxt)) / duration, InterruptsPerS: float64(saturatingSub(intr, c.prevIntr)) / duration, MemoryUsedMB: float64(memUsed), MemoryCachedMB: float64(memCached), MemoryDirtyMB: float64(memDirty), NetworkInMBps: float64(saturatingSub(netIn, c.prevNetIn)) / duration / 1_048_576, NetworkOutMBps: float64(saturatingSub(netOut, c.prevNetOut)) / duration / 1_048_576, NetErrorsPerS: float64(saturatingSub(netErrs, c.prevNetErrs)) / duration, NetDropsPerS: float64(saturatingSub(netDrops, c.prevNetDrops)) / duration, TCPRetransPerS: float64(saturatingSub(retrans, c.prevTCPRetrans)) / duration, TCPTimeoutsPerS: float64(saturatingSub(timeouts, c.prevTCPTimeouts)) / duration, TCPLostRetransmitPerS: float64(saturatingSub(lostRetrans, c.prevTCPLostRetrans)) / duration, TCPFastRetransPerS: float64(saturatingSub(fastRetrans, c.prevTCPFastRetrans)) / duration, SoftnetDroppedPerS: float64(saturatingSub(softDropped, c.prevSoftnetDropped)) / duration, SoftnetTimeSqueezePerS: float64(saturatingSub(softSqueeze, c.prevSoftnetSqueeze)) / duration, DiskReadMBps: float64(saturatingSub(diskRead, c.prevDiskRead)) / duration / 1_048_576, DiskWriteMBps: float64(saturatingSub(diskWrite, c.prevDiskWrite)) / duration / 1_048_576, DiskReadTimeMsPerS: float64(saturatingSub(diskReadTime, c.prevDiskReadTimeMs)) / duration, DiskWriteTimeMsPerS: float64(saturatingSub(diskWriteTime, c.prevDiskWriteTimeMs)) / duration, DiskIOTicksPerS: float64(saturatingSub(diskIOTicks, c.prevDiskIOTicks)) / duration, NetPacketsInPerS: float64(saturatingSub(rxPackets, c.prevNetPacketsIn)) / duration, NetPacketsOutPerS: float64(saturatingSub(txPackets, c.prevNetPacketsOut)) / duration, DiskReadsCompletedPerS: float64(saturatingSub(readsComp, c.prevDiskReadsComp)) / duration, DiskWritesCompletedPerS: float64(saturatingSub(writesComp, c.prevDiskWritesComp)) / duration, } c.storePrev(now, cpuTotal, cpuIdle, cpuIowait, cpuSoftirq, ctxt, intr, netIn, netOut, netErrs, netDrops, rxPackets, txPackets, retrans, timeouts, lostRetrans, fastRetrans, softDropped, softSqueeze, diskRead, diskWrite, diskReadTime, diskWriteTime, diskIOTicks, readsComp, writesComp) return snap } func (c *MetricCollector) storePrev( now time.Time, cpuTotal, cpuIdle, cpuIowait, cpuSoftirq, ctxt, intr, netIn, netOut, netErrs, netDrops, rxPackets, txPackets, retrans, timeouts, lostRetrans, fastRetrans, softDropped, softSqueeze, diskRead, diskWrite, diskReadTime, diskWriteTime, diskIOTicks, readsComp, writesComp uint64, ) { c.prevTime = now c.prevCPUTotal = cpuTotal c.prevCPUIdle = cpuIdle c.prevCPUIoWait = cpuIowait c.prevCPUSoftIrq = cpuSoftirq c.prevCtxt = ctxt c.prevIntr = intr c.prevNetIn = netIn c.prevNetOut = netOut c.prevNetErrs = netErrs c.prevNetDrops = netDrops c.prevTCPRetrans = retrans c.prevTCPTimeouts = timeouts c.prevTCPLostRetrans = lostRetrans c.prevTCPFastRetrans = fastRetrans c.prevSoftnetDropped = softDropped c.prevSoftnetSqueeze = softSqueeze c.prevDiskRead = diskRead c.prevDiskWrite = diskWrite c.prevDiskReadTimeMs = diskReadTime c.prevDiskWriteTimeMs = diskWriteTime c.prevDiskIOTicks = diskIOTicks c.prevNetPacketsIn = rxPackets c.prevNetPacketsOut = txPackets c.prevDiskReadsComp = readsComp c.prevDiskWritesComp = writesComp } // ── /proc readers ───────────────────────────────────────────────────────────── // readSystemStats reads /proc/stat and returns cumulative CPU jiffies // (total, idle, iowait, softirq) plus cumulative context-switches and // interrupt counts. // // /proc/stat CPU column layout: // // col 1=user 2=nice 3=system 4=idle 5=iowait 6=irq 7=softirq func (c *MetricCollector) readSystemStats() (total, idle, iowait, softirq, ctxt, intr uint64) { f, err := os.Open("/proc/stat") if err != nil { log.Printf("metric: open /proc/stat: %v", err) return } defer f.Close() scanner := bufio.NewScanner(f) for scanner.Scan() { fields := strings.Fields(scanner.Text()) if len(fields) == 0 { continue } switch fields[0] { case "cpu": for i := 1; i < len(fields); i++ { v, _ := strconv.ParseUint(fields[i], 10, 64) total += v switch i { case 4: idle = v case 5: iowait = v case 7: softirq = v } } case "ctxt": if len(fields) > 1 { ctxt, _ = strconv.ParseUint(fields[1], 10, 64) } case "intr": if len(fields) > 1 { intr, _ = strconv.ParseUint(fields[1], 10, 64) } } } if err := scanner.Err(); err != nil { log.Printf("metric: scan /proc/stat: %v", err) } return } func (c *MetricCollector) readMemInfo() (used, cached, dirty uint64) { f, err := os.Open("/proc/meminfo") if err != nil { log.Printf("metric: open /proc/meminfo: %v", err) return } defer f.Close() var total, available uint64 scanner := bufio.NewScanner(f) for scanner.Scan() { fields := strings.Fields(scanner.Text()) if len(fields) < 2 { continue } val, _ := strconv.ParseUint(fields[1], 10, 64) switch fields[0] { case "MemTotal:": total = val case "MemAvailable:": available = val case "Cached:": cached = val / 1024 // kB → MB case "Dirty:": dirty = val / 1024 // kB → MB } } if err := scanner.Err(); err != nil { log.Printf("metric: scan /proc/meminfo: %v", err) } if total >= available { used = (total - available) / 1024 } return } // readNetDev reads /proc/net/dev for the configured interface. // // /proc/net/dev column layout (after stripping "iface:"): // // 0=rx_bytes 1=rx_packets 2=rx_errs 3=rx_drop // 4=rx_fifo 5=rx_frame 6=rx_compressed 7=rx_multicast // 8=tx_bytes 9=tx_packets 10=tx_errs 11=tx_drop ... // 8=tx_bytes 9=tx_packets 10=tx_errs 11=tx_drop ... func (c *MetricCollector) readNetDev() (rxBytes, txBytes, errs, drops, rxPackets, txPackets uint64) { f, err := os.Open("/proc/net/dev") if err != nil { return 0, 0, 0, 0, 0, 0 } defer f.Close() prefix := c.netInterface + ":" scanner := bufio.NewScanner(f) for scanner.Scan() { line := strings.TrimSpace(scanner.Text()) if !strings.HasPrefix(line, prefix) { continue } line = strings.TrimPrefix(line, prefix) fields := strings.Fields(line) if len(fields) < 12 { log.Printf("metric: unexpected /proc/net/dev format for %q", c.netInterface) return 0, 0, 0, 0, 0, 0 } rxBytes, _ = strconv.ParseUint(fields[0], 10, 64) rxPackets, _ = strconv.ParseUint(fields[1], 10, 64) rxErrs, _ := strconv.ParseUint(fields[2], 10, 64) rxDrops, _ := strconv.ParseUint(fields[3], 10, 64) txBytes, _ = strconv.ParseUint(fields[8], 10, 64) txPackets, _ = strconv.ParseUint(fields[9], 10, 64) txErrs, _ := strconv.ParseUint(fields[10], 10, 64) txDrops, _ := strconv.ParseUint(fields[11], 10, 64) return rxBytes, txBytes, rxErrs + txErrs, rxDrops + txDrops, rxPackets, txPackets } if err := scanner.Err(); err != nil { log.Printf("metric: scan /proc/net/dev: %v", err) } return 0, 0, 0, 0, 0, 0 } // readSNMPStats reads RetransSegs from /proc/net/snmp (Tcp section). // // /proc/net/snmp Tcp header order (kernel-stable): // // RtoAlgorithm RtoMin RtoMax MaxConn ActiveOpens PassiveOpens // AttemptFails EstabResets CurrEstab InSegs OutSegs RetransSegs InErrs OutRsts // // RetransSegs is at index 12 (0-based) in the value row. func (c *MetricCollector) readSNMPStats() uint64 { f, err := os.Open("/proc/net/snmp") if err != nil { return 0 } defer f.Close() // The file alternates header/value rows for each protocol block. // We need both rows to find RetransSegs by column name. scanner := bufio.NewScanner(f) var tcpHeader []string for scanner.Scan() { line := scanner.Text() if !strings.HasPrefix(line, "Tcp:") { continue } fields := strings.Fields(line) if tcpHeader == nil { tcpHeader = fields // first Tcp: line is the header continue } // second Tcp: line is the values for i, h := range tcpHeader { if h == "RetransSegs" && i < len(fields) { v, _ := strconv.ParseUint(fields[i], 10, 64) return v } } } if err := scanner.Err(); err != nil { log.Printf("metric: scan /proc/net/snmp: %v", err) } return 0 } // readNetstat reads TCPTimeouts, TCPLostRetransmit and TCPFastRetrans from // /proc/net/netstat (TcpExt section). The file alternates header/value rows. func (c *MetricCollector) readNetstat() (timeouts, lostRetrans, fastRetrans uint64) { f, err := os.Open("/proc/net/netstat") if err != nil { return 0, 0, 0 } defer f.Close() scanner := bufio.NewScanner(f) var headers []string for scanner.Scan() { line := scanner.Text() if !strings.HasPrefix(line, "TcpExt:") { continue } fields := strings.Fields(line) if headers == nil { headers = fields continue } // value row for i, h := range headers { if i >= len(fields) { break } switch h { case "TCPTimeouts": timeouts, _ = strconv.ParseUint(fields[i], 10, 64) case "TCPLostRetransmit": lostRetrans, _ = strconv.ParseUint(fields[i], 10, 64) case "TCPFastRetrans": fastRetrans, _ = strconv.ParseUint(fields[i], 10, 64) } } } if err := scanner.Err(); err != nil { log.Printf("metric: scan /proc/net/netstat: %v", err) } return } // readSoftnetStat reads /proc/net/softnet_stat and sums dropped and // time_squeeze across all CPU columns (hex values). func (c *MetricCollector) readSoftnetStat() (dropped, timeSqueeze uint64) { f, err := os.Open("/proc/net/softnet_stat") if err != nil { return 0, 0 } defer f.Close() scanner := bufio.NewScanner(f) for scanner.Scan() { fields := strings.Fields(scanner.Text()) // col 0 = total, col 1 = dropped, col 2 = time_squeeze if len(fields) >= 3 { d, _ := strconv.ParseUint(fields[1], 16, 64) t, _ := strconv.ParseUint(fields[2], 16, 64) dropped += d timeSqueeze += t } } if err := scanner.Err(); err != nil { log.Printf("metric: scan /proc/net/softnet_stat: %v", err) } return } // readDiskStats reads /proc/diskstats for the configured device. // // /proc/diskstats column layout (kernel ≥ 4.18): // // 0=major 1=minor 2=name // 3=reads_completed 4=reads_merged 5=sectors_read 6=read_time_ms // 7=writes_completed 8=writes_merged 9=sectors_written 10=write_time_ms // 11=io_in_progress 12=io_ticks_ms 13=weighted_io_ticks // 11=io_in_progress 12=io_ticks_ms 13=weighted_io_ticks func (c *MetricCollector) readDiskStats() (readBytes, writeBytes, readTimeMs, writeTimeMs, ioTicks, readsComp, writesComp uint64) { f, err := os.Open("/proc/diskstats") if err != nil { log.Printf("metric: open /proc/diskstats: %v", err) return } defer f.Close() scanner := bufio.NewScanner(f) for scanner.Scan() { fields := strings.Fields(scanner.Text()) if len(fields) < 14 || fields[2] != c.diskDevice { continue } readsComp, _ = strconv.ParseUint(fields[3], 10, 64) writesComp, _ = strconv.ParseUint(fields[7], 10, 64) rSectors, _ := strconv.ParseUint(fields[5], 10, 64) wSectors, _ := strconv.ParseUint(fields[9], 10, 64) rTime, _ := strconv.ParseUint(fields[6], 10, 64) wTime, _ := strconv.ParseUint(fields[10], 10, 64) ticks, _ := strconv.ParseUint(fields[12], 10, 64) return rSectors * 512, wSectors * 512, rTime, wTime, ticks, readsComp, writesComp } if err := scanner.Err(); err != nil { log.Printf("metric: scan /proc/diskstats: %v", err) } return } // ── health ──────────────────────────────────────────────────────────────────── func (c *MetricCollector) emitHealth() { p := c.processed.Load() d := c.dropped.Load() select { case c.healthChan <- types.StageHealth{ StageName: "metric_collector", EventsProcessed: p, EventsDropped: d, Throughput: float64(p) / 5.0, LastUpdate: time.Now(), }: default: } } // ── helpers ─────────────────────────────────────────────────────────────────── // saturatingSub returns a − b, clamped to 0 on underflow. // 64-bit /proc counters very rarely wrap, but saturation prevents negative rates. func saturatingSub(a, b uint64) uint64 { if a >= b { return a - b } return 0 }