commit for version used in evaluation of thesis

This commit is contained in:
Patryk Hegenberg 2026-03-29 10:03:18 +02:00
commit 72635dc7b9
27 changed files with 6084 additions and 0 deletions

250
internal/collector/log.go Normal file
View file

@ -0,0 +1,250 @@
package collector
import (
"bufio"
"context"
"fmt"
"io"
"log"
"os"
"strings"
"sync"
"sync/atomic"
"time"
drain3go "codeberg.org/pata1704/drain3"
"codeberg.org/pata1704/guenther/internal/config"
idrain3 "codeberg.org/pata1704/guenther/internal/drain3"
"codeberg.org/pata1704/guenther/pkg/types"
"github.com/fsnotify/fsnotify"
)
// linePool hands out reusable *strings.Builder values so that per-line work
// on high-volume log files does not have to allocate a fresh builder each
// time.
//
// NOTE(review): nothing in the visible part of this file calls linePool.Get
// or linePool.Put; confirm the pool is still needed.
var linePool = sync.Pool{
	New: func() any {
		return &strings.Builder{}
	},
}
// LogCollector follows a single log file via fsnotify and converts the lines
// appended to it into types.LogEvent values.
//
// Every non-empty line goes through the following steps:
//   - masking: ApplyMasking extracts named parameters and returns the masked
//     line;
//   - mining: the masked line is handed to the Drain3 template miner;
//   - severity: the raw (unmasked) line is classified;
//   - emit: the event is sent without blocking; if the output channel is
//     full the event is discarded and counted as dropped.
//
// All of this runs on one goroutine per collector, tracked by a WaitGroup so
// that callers can wait for a clean shutdown.
type LogCollector struct {
	// cfg supplies the log path, the Drain parameters and the masking patterns.
	cfg *config.Config
	// miner assigns a template (cluster) to each masked line.
	miner *drain3go.TemplateMiner

	// outputChan receives one LogEvent per successfully mined line.
	outputChan chan<- types.LogEvent
	// healthChan receives periodic StageHealth snapshots.
	healthChan chan<- types.StageHealth

	// wg tracks the tailing goroutine started by Start.
	wg sync.WaitGroup

	// processed counts events that were delivered to outputChan.
	processed atomic.Uint64
	// dropped counts events discarded because outputChan was full.
	dropped atomic.Uint64
}
// NewLogCollector builds a LogCollector that publishes events on output and
// health snapshots on health.
//
// The Drain3 miner starts from the library defaults, overridden with the
// similarity threshold, tree depth and max-children values from cfg.Drain.
// It is backed by an in-memory persistence store, so the template tree is
// not kept across restarts (a file-backed store could be swapped in if that
// is ever required).
func NewLogCollector(
	cfg *config.Config,
	output chan<- types.LogEvent,
	health chan<- types.StageHealth,
) *LogCollector {
	drainCfg := drain3go.DefaultConfig()
	drainCfg.SimTh = cfg.Drain.SimThreshold
	drainCfg.Depth = cfg.Drain.Depth
	drainCfg.MaxChildren = cfg.Drain.MaxChildren

	collector := &LogCollector{
		cfg:        cfg,
		outputChan: output,
		healthChan: health,
	}
	collector.miner = drain3go.NewTemplateMiner(drainCfg, drain3go.NewMemoryPersistence())
	return collector
}
// Start opens cfg.Ingestion.LogPath, positions the read offset at the end of
// the file and launches the goroutine that tails it.
//
// An error is returned when the file cannot be opened or seeked, or when the
// fsnotify watcher cannot be created or attached to the path. After Start
// has returned nil, failures are only logged. The goroutine stops when ctx
// is cancelled, when the watcher's channels are closed, or when the file
// cannot be reopened after a rotation; use Wait to block until it has exited.
func (c *LogCollector) Start(ctx context.Context) error {
	f, err := os.Open(c.cfg.Ingestion.LogPath)
	if err != nil {
		return fmt.Errorf("log collector: open %q: %w", c.cfg.Ingestion.LogPath, err)
	}
	// Seek to end: only tail new content, not existing content.
	if _, err := f.Seek(0, io.SeekEnd); err != nil {
		f.Close()
		return fmt.Errorf("log collector: seek %q: %w", c.cfg.Ingestion.LogPath, err)
	}

	watcher, err := fsnotify.NewWatcher()
	if err != nil {
		f.Close()
		return fmt.Errorf("log collector: create fsnotify watcher: %w", err)
	}
	if err := watcher.Add(c.cfg.Ingestion.LogPath); err != nil {
		f.Close()
		watcher.Close()
		return fmt.Errorf("log collector: watch %q: %w", c.cfg.Ingestion.LogPath, err)
	}

	reader := bufio.NewReaderSize(f, 64*1024)
	reportTicker := time.NewTicker(5 * time.Second)

	c.wg.Go(func() {
		// f is reassigned on rotation. A plain `defer f.Close()` would bind
		// the handle that is current right now and leak every reopened file,
		// so close through a closure that reads f at exit time. The handle is
		// read-only, hence the close error is not actionable and is ignored.
		defer func() { f.Close() }()
		defer watcher.Close()
		defer reportTicker.Stop()

		for {
			select {
			case event, ok := <-watcher.Events:
				if !ok {
					return
				}
				if event.Has(fsnotify.Write) {
					c.drainReader(reader)
				}
				if event.Has(fsnotify.Remove) || event.Has(fsnotify.Rename) {
					// Log rotation: reopen the file.
					log.Printf("log collector: file %q rotated reopening", c.cfg.Ingestion.LogPath)
					// The old handle stays readable after a rename or unlink;
					// pick up what was written just before the rotation so it
					// is not lost when the handle is closed.
					c.drainReader(reader)
					f.Close()
					// NOTE(review): if the rotating process has not created
					// the new file yet, this fails and the collector stops
					// for good; consider a bounded retry.
					newF, err := c.reopenFile()
					if err != nil {
						log.Printf("log collector: reopen after rotation: %v", err)
						return
					}
					f = newF
					reader = bufio.NewReaderSize(f, 64*1024)
					if err := watcher.Add(c.cfg.Ingestion.LogPath); err != nil {
						log.Printf("log collector: re-watch after rotation: %v", err)
					}
				}
			case err, ok := <-watcher.Errors:
				if !ok {
					return
				}
				log.Printf("log collector: watcher error: %v", err)
			case <-reportTicker.C:
				c.emitHealth()
			case <-ctx.Done():
				return
			}
		}
	})
	return nil
}
// Wait blocks until the tailing goroutine launched by Start has returned,
// which is the case once the context given to Start has been cancelled (or
// the goroutine has stopped on its own).
func (c *LogCollector) Wait() {
	c.wg.Wait()
}
// drainReader processes every complete (newline-terminated) line that can
// currently be read from r and returns once the underlying source has no
// more data.
//
// An unterminated tail is left untouched in the bufio buffer, so a line that
// the writer has only partially flushed is not emitted as two truncated
// events; it is picked up by a later call once its newline has arrived.
//
// Exception: a line that does not fit into r's buffer cannot be held back.
// Such a line is read with ReadString, which means an unterminated oversize
// tail is still emitted as-is when EOF is reached.
//
// Read errors other than io.EOF are logged; either way the function returns.
func (c *LogCollector) drainReader(r *bufio.Reader) {
	for {
		// Look at what is already buffered. Peek never consumes, and asking
		// for exactly Buffered() bytes neither reads from the file nor fails.
		buffered := r.Buffered()
		pending, _ := r.Peek(buffered)

		newline := -1
		for i, b := range pending {
			if b == '\n' {
				newline = i
				break
			}
		}
		if newline >= 0 {
			// Copy the line out before advancing the reader.
			line := string(pending[:newline])
			// Cannot fail: the discarded bytes are all buffered.
			_, _ = r.Discard(newline + 1)
			c.processLine(strings.TrimRight(line, "\r\n"))
			continue
		}

		if buffered >= r.Size() {
			// The buffer is full and holds no newline: the line is longer
			// than the buffer, so let ReadString accumulate it.
			line, err := r.ReadString('\n')
			if len(line) > 0 {
				c.processLine(strings.TrimRight(line, "\r\n"))
			}
			if err != nil {
				if err != io.EOF {
					log.Printf("log collector: read error: %v", err)
				}
				return
			}
			continue
		}

		// No complete line buffered yet: request one byte more than is
		// buffered to force a refill from the file. io.EOF means no more
		// data for now; the partial line stays buffered for the next call.
		if _, err := r.Peek(buffered + 1); err != nil {
			if err != io.EOF {
				log.Printf("log collector: read error: %v", err)
			}
			return
		}
	}
}
// processLine turns one raw log line into a LogEvent and tries to publish it.
//
// Empty lines are ignored. Otherwise the line is masked (which also yields
// the extracted parameters), the masked text is fed to the template miner,
// and the severity is classified from the raw line. Nothing is emitted when
// the miner returns nil. The send never blocks: a delivered event bumps the
// processed counter, an event that finds the channel full bumps the dropped
// counter and is discarded.
func (c *LogCollector) processLine(line string) {
	if len(line) == 0 {
		return
	}

	// Masking and parameter extraction.
	maskedLine, extracted := idrain3.ApplyMasking(line, c.cfg.Drain.MaskingPatterns)

	// Template mining operates on the masked text only.
	mined := c.miner.AddLogMessage(maskedLine)
	if mined == nil {
		return
	}

	var ev types.LogEvent
	ev.Timestamp = time.Now()
	ev.TemplateID = mined.ClusterID
	ev.Params = extracted
	ev.Severity = classifySeverity(line)
	ev.RawLine = line

	select {
	case c.outputChan <- ev:
		c.processed.Add(1)
	default:
		// Downstream is backlogged; drop rather than stall the tailer.
		c.dropped.Add(1)
	}
}
// reopenFile opens cfg.Ingestion.LogPath again after a log rotation. The
// file is freshly opened and no seek is performed, so reading starts at the
// beginning of the new file. A failure is returned wrapped as "open: ...".
func (c *LogCollector) reopenFile() (*os.File, error) {
	reopened, err := os.Open(c.cfg.Ingestion.LogPath)
	if err == nil {
		return reopened, nil
	}
	return nil, fmt.Errorf("open: %w", err)
}
// emitHealth publishes a StageHealth snapshot of the processed and dropped
// counters on healthChan. The send is non-blocking: if the channel is full
// the snapshot is silently discarded.
//
// NOTE(review): Throughput is the cumulative processed count divided by 5.0,
// not the number of events seen since the previous snapshot, so it keeps
// growing over the lifetime of the collector. The 5.0 presumably mirrors the
// 5-second report ticker in Start; confirm the intended meaning.
func (c *LogCollector) emitHealth() {
	processed, dropped := c.processed.Load(), c.dropped.Load()

	snapshot := types.StageHealth{
		StageName:       "log_collector",
		EventsProcessed: processed,
		EventsDropped:   dropped,
		Throughput:      float64(processed) / 5.0,
		LastUpdate:      time.Now(),
	}

	select {
	case c.healthChan <- snapshot:
	default:
		// Nobody is ready to receive; skip this report.
	}
}
// classifySeverity derives a severity label from a raw log line by looking
// for well-known keywords, ignoring case. The first matching rule wins:
//
//	"ERROR"  the line contains ERROR, FATAL or CRITICAL anywhere, or the
//	         abbreviation ERR as a standalone token
//	"WARN"   the line contains WARN (which also covers WARNING)
//	"DEBUG"  the line contains DEBUG
//	"INFO"   otherwise, including for the empty line
//
// ERR only counts when it is not surrounded by ASCII letters or digits, so
// "[err]" and "ERR_TIMEOUT" qualify while ordinary words that merely contain
// the letters, such as "referrer", "override" or "deferred", do not.
func classifySeverity(line string) string {
	upper := strings.ToUpper(line)
	switch {
	case strings.Contains(upper, "ERROR"),
		strings.Contains(upper, "FATAL"),
		strings.Contains(upper, "CRITICAL"),
		hasStandaloneWord(upper, "ERR"):
		return "ERROR"
	case strings.Contains(upper, "WARN"):
		return "WARN"
	case strings.Contains(upper, "DEBUG"):
		return "DEBUG"
	default:
		return "INFO"
	}
}

// hasStandaloneWord reports whether word occurs in s with no ASCII letter or
// digit directly before or after it.
func hasStandaloneWord(s, word string) bool {
	for from := 0; from < len(s); {
		rel := strings.Index(s[from:], word)
		if rel < 0 {
			return false
		}
		start := from + rel
		end := start + len(word)
		leftOK := start == 0 || !isASCIIAlnum(s[start-1])
		rightOK := end == len(s) || !isASCIIAlnum(s[end])
		if leftOK && rightOK {
			return true
		}
		// Embedded in a longer word; keep searching after this occurrence.
		from = start + 1
	}
	return false
}

// isASCIIAlnum reports whether b is an ASCII letter or digit.
func isASCIIAlnum(b byte) bool {
	switch {
	case b >= 'A' && b <= 'Z', b >= 'a' && b <= 'z', b >= '0' && b <= '9':
		return true
	default:
		return false
	}
}