diff --git a/plugins/inputs/system/README.md b/plugins/inputs/system/README.md index c81e0e975eabd..6a7076c3980aa 100644 --- a/plugins/inputs/system/README.md +++ b/plugins/inputs/system/README.md @@ -20,9 +20,28 @@ plugin ordering. See [CONFIGURATION.md][CONFIGURATION.md] for more details. ```toml @sample.conf # Read metrics about system load & uptime [[inputs.system]] - # no configuration + ## Information to collect; available options are: + ## load - 1, 5 and 15-minute load averages + ## users - logged-in user counts + ## cpus - CPU counts of the system + ## legacy_cpus - legacy layout of CPU counts; see README for details + ## uptime - system uptime + ## legacy_uptime - legacy layout of system uptime; see README for details + # include = ["load", "users", "legacy_cpus", "legacy_uptime"] ``` +> [!NOTE] +> The `cpus` and `legacy_cpus` options are mutually exclusive, +> as are `uptime` and `legacy_uptime`. + + + +> [!IMPORTANT] +> Switching from `legacy_uptime` to `uptime` changes the Prometheus metric +> type of `system_uptime` from **counter** to **gauge**. If your dashboards +> or alerts use `rate()` or `increase()` on `system_uptime`, update them +> before migrating. + ### Permissions The `n_users` field requires read access to `/var/run/utmp`, and may require the @@ -35,22 +54,43 @@ same requirements for `n_users` apply. ## Metrics -- system - - fields: - - load1 (float) - - load15 (float) - - load5 (float) - - n_users (integer) - - n_unique_users (integer) - - n_cpus (integer) - - n_physical_cpus (integer) - - uptime (integer, seconds) - - uptime_format (string, deprecated in 1.10, use `uptime` field) +### `system` + +All fields below belong to the `system` measurement. The `include` option +controls which groups are gathered. + +| Field | Include option | Type | Description | +|-------------------|----------------------------|---------|---------------------------------------------| +| `load1` | `load` | float | 1-minute load average | +| `load5` | `load` | float | 5-minute load average | +| `load15` | `load` | float | 15-minute load average | +| `n_users` | `users` | integer | Number of logged-in user sessions | +| `n_unique_users` | `users` | integer | Number of unique logged-in usernames | +| `n_virtual_cpus` | `cpus` | integer | Number of logical CPUs | +| `n_cpus` | `legacy_cpus` | integer | Number of logical CPUs (legacy name) | +| `n_physical_cpus` | `cpus` / `legacy_cpus` | integer | Number of physical CPUs | +| `uptime` | `uptime` | integer | System uptime in seconds (gauge field) | +| `uptime` | `legacy_uptime` | integer | System uptime in seconds (separate counter) | +| `uptime_format` | `legacy_uptime` | string | Human-readable uptime (deprecated) | ## Example Output +### Default configuration + +With the default `include = ["load", "users", "legacy_cpus", "legacy_uptime"]`, +the output is backward-compatible with previous versions: + +```text +system,host=worker-01 load1=3.72,load5=2.4,load15=2.1,n_users=3i,n_unique_users=2i,n_cpus=4i,n_physical_cpus=2i 1748000000000000000 +system,host=worker-01 uptime=1249632i 1748000000000000000 +system,host=worker-01 uptime_format="14 days, 11:07" 1748000000000000000 +``` + +### Recommended configuration + +With `include = ["load", "users", "cpus", "uptime"]`, all fields are emitted +in a single metric with the new field names: + ```text -system,host=tyrion load1=3.72,load5=2.4,load15=2.1,n_users=3i,n_cpus=4i,n_physical_cpus=2i 1483964144000000000 -system,host=tyrion uptime=1249632i 1483964144000000000 -system,host=tyrion uptime_format="14 days, 11:07" 1483964144000000000 +system,host=worker-01 load1=3.72,load5=2.4,load15=2.1,n_users=3i,n_unique_users=2i,n_virtual_cpus=4i,n_physical_cpus=2i,uptime=1249632i 1748000000000000000 ``` diff --git a/plugins/inputs/system/sample.conf b/plugins/inputs/system/sample.conf index 03f911c5b0890..b3c23860f436e 100644 --- a/plugins/inputs/system/sample.conf +++ b/plugins/inputs/system/sample.conf @@ -1,3 +1,10 @@ # Read metrics about system load & uptime [[inputs.system]] - # no configuration + ## Information to collect; available options are: + ## load - 1, 5 and 15-minute load averages + ## users - logged-in user counts + ## cpus - CPU counts of the system + ## legacy_cpus - legacy layout of CPU counts; see README for details + ## uptime - system uptime + ## legacy_uptime - legacy layout of system uptime; see README for details + # include = ["load", "users", "legacy_cpus", "legacy_uptime"] diff --git a/plugins/inputs/system/system.go b/plugins/inputs/system/system.go index c4e8e8079cf07..292b02a70aa3b 100644 --- a/plugins/inputs/system/system.go +++ b/plugins/inputs/system/system.go @@ -5,6 +5,7 @@ import ( "bufio" "bytes" _ "embed" + "errors" "fmt" "os" "strings" @@ -15,6 +16,7 @@ import ( "github.com/shirou/gopsutil/v4/load" "github.com/influxdata/telegraf" + "github.com/influxdata/telegraf/config" "github.com/influxdata/telegraf/plugins/inputs" ) @@ -22,61 +24,144 @@ import ( var sampleConfig string type System struct { - Log telegraf.Logger `toml:"-"` + Include []string `toml:"include"` + Log telegraf.Logger `toml:"-"` } func (*System) SampleConfig() string { return sampleConfig } -func (s *System) Gather(acc telegraf.Accumulator) error { - loadavg, err := load.Avg() - if err != nil && !strings.Contains(err.Error(), "not implemented") { - return err +func (s *System) Init() error { + // Suppress deprecation warnings for default-only configs. + userSupplied := len(s.Include) > 0 + if !userSupplied { + s.Include = []string{"load", "users", "legacy_cpus", "legacy_uptime"} } - numLogicalCPUs, err := cpu.Counts(true) - if err != nil { - return err + enabled := make(map[string]bool, len(s.Include)) + deduped := make([]string, 0, len(s.Include)) + for _, incl := range s.Include { + if enabled[incl] { + continue + } + switch incl { + case "load", "users", "cpus", "uptime": + case "legacy_cpus": + if userSupplied { + config.PrintOptionValueDeprecationNotice( + "inputs.system", + "include", + "legacy_cpus", + telegraf.DeprecationInfo{ + Since: "1.39.0", + RemovalIn: "1.45.0", + Notice: "use 'cpus' instead", + }, + ) + } + case "legacy_uptime": + if userSupplied { + config.PrintOptionValueDeprecationNotice( + "inputs.system", + "include", + "legacy_uptime", + telegraf.DeprecationInfo{ + Since: "1.39.0", + RemovalIn: "1.45.0", + Notice: "use 'uptime' instead", + }, + ) + } + default: + return fmt.Errorf("invalid 'include' option %q", incl) + } + enabled[incl] = true + deduped = append(deduped, incl) } + s.Include = deduped - numPhysicalCPUs, err := cpu.Counts(false) - if err != nil { - return err + if enabled["cpus"] && enabled["legacy_cpus"] { + return errors.New(`"cpus" and "legacy_cpus" are mutually exclusive`) } - - fields := map[string]interface{}{ - "load1": loadavg.Load1, - "load5": loadavg.Load5, - "load15": loadavg.Load15, - "n_cpus": numLogicalCPUs, - "n_physical_cpus": numPhysicalCPUs, + if enabled["uptime"] && enabled["legacy_uptime"] { + return errors.New(`"uptime" and "legacy_uptime" are mutually exclusive`) } - users, err := host.Users() - if err == nil { - fields["n_users"] = len(users) - fields["n_unique_users"] = findUniqueUsers(users) - } else if os.IsNotExist(err) { - s.Log.Debugf("Reading users: %s", err.Error()) - } else if os.IsPermission(err) { - s.Log.Debug(err.Error()) - } + return nil +} +func (s *System) Gather(acc telegraf.Accumulator) error { now := time.Now() - acc.AddGauge("system", fields, nil, now) - - uptime, err := host.Uptime() - if err != nil { - return err + fields := make(map[string]interface{}, 8) + + for _, incl := range s.Include { + switch incl { + case "load": + loadavg, err := load.Avg() + if err != nil { + if !strings.Contains(err.Error(), "not implemented") { + acc.AddError(fmt.Errorf("reading load averages: %w", err)) + } + continue + } + fields["load1"] = loadavg.Load1 + fields["load5"] = loadavg.Load5 + fields["load15"] = loadavg.Load15 + case "users": + users, err := host.Users() + if err == nil { + fields["n_users"] = len(users) + fields["n_unique_users"] = findUniqueUsers(users) + } else if os.IsNotExist(err) { + s.Log.Debugf("Reading users: %s", err.Error()) + } else if os.IsPermission(err) { + s.Log.Debug(err.Error()) + } else { + s.Log.Warnf("Reading users: %s", err.Error()) + } + case "cpus", "legacy_cpus": + numLogicalCPUs, err := cpu.Counts(true) + if err != nil { + acc.AddError(fmt.Errorf("reading logical CPU count: %w", err)) + continue + } + numPhysicalCPUs, err := cpu.Counts(false) + if err != nil { + acc.AddError(fmt.Errorf("reading physical CPU count: %w", err)) + continue + } + if incl == "cpus" { + fields["n_virtual_cpus"] = numLogicalCPUs + } else { + fields["n_cpus"] = numLogicalCPUs + } + fields["n_physical_cpus"] = numPhysicalCPUs + case "uptime": + uptime, err := host.Uptime() + if err != nil { + acc.AddError(fmt.Errorf("reading uptime: %w", err)) + continue + } + fields["uptime"] = uptime + case "legacy_uptime": + uptime, err := host.Uptime() + if err != nil { + acc.AddError(fmt.Errorf("reading uptime: %w", err)) + continue + } + acc.AddCounter("system", map[string]interface{}{ + "uptime": uptime, + }, nil, now) + acc.AddFields("system", map[string]interface{}{ + "uptime_format": formatUptime(uptime), + }, nil, now) + } } - acc.AddCounter("system", map[string]interface{}{ - "uptime": uptime, - }, nil, now) - acc.AddFields("system", map[string]interface{}{ - "uptime_format": formatUptime(uptime), - }, nil, now) + if len(fields) > 0 { + acc.AddGauge("system", fields, nil, now) + } return nil } @@ -88,7 +173,6 @@ func findUniqueUsers(userStats []host.UserStat) int { uniqueUsers[userstat.User] = true } } - return len(uniqueUsers) } @@ -97,7 +181,6 @@ func formatUptime(uptime uint64) string { w := bufio.NewWriter(buf) days := uptime / (60 * 60 * 24) - if days != 0 { s := "" if days > 1 { diff --git a/plugins/inputs/system/system_test.go b/plugins/inputs/system/system_test.go index 24fe747589da3..4ab4b3115c951 100644 --- a/plugins/inputs/system/system_test.go +++ b/plugins/inputs/system/system_test.go @@ -2,9 +2,14 @@ package system import ( "testing" + "time" "github.com/shirou/gopsutil/v4/host" "github.com/stretchr/testify/require" + + "github.com/influxdata/telegraf" + "github.com/influxdata/telegraf/metric" + "github.com/influxdata/telegraf/testutil" ) func TestUniqueUsers(t *testing.T) { @@ -62,3 +67,242 @@ func TestUniqueUsers(t *testing.T) { }) } } + +func TestInitAllValidOptions(t *testing.T) { + // cpus/legacy_cpus and uptime/legacy_uptime are mutually exclusive, + // so cover all six valid values across two configurations. + tests := []struct { + name string + include []string + }{ + {"new", []string{"load", "users", "cpus", "uptime"}}, + {"legacy", []string{"load", "users", "legacy_cpus", "legacy_uptime"}}, + } + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + s := &System{Include: tt.include, Log: &testutil.Logger{}} + require.NoError(t, s.Init()) + }) + } +} + +func TestInitErrors(t *testing.T) { + tests := []struct { + name string + include []string + errMsg string + }{ + { + name: "invalid option", + include: []string{"invalid"}, + errMsg: `invalid 'include' option "invalid"`, + }, + { + name: "cpus mutually exclusive", + include: []string{"cpus", "legacy_cpus"}, + errMsg: "mutually exclusive", + }, + { + name: "uptime mutually exclusive", + include: []string{"uptime", "legacy_uptime"}, + errMsg: "mutually exclusive", + }, + } + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + s := &System{ + Include: tt.include, + Log: &testutil.Logger{}, + } + require.ErrorContains(t, s.Init(), tt.errMsg) + }) + } +} + +func TestGather(t *testing.T) { + // host.Users() depends on /var/run/utmp which is not available on every + // runner. On Linux we mock it via HOST_VAR; on other platforms we probe + // at runtime and skip relevant cases when the call cannot be satisfied. + usersAvailable := setupUsers(t) + + tests := []struct { + name string + include []string + expected []telegraf.Metric + requireUsers bool + }{ + { + name: "default", + include: nil, + requireUsers: true, + expected: []telegraf.Metric{ + metric.New( + "system", + map[string]string{}, + map[string]interface{}{ + "load1": float64(0), + "load5": float64(0), + "load15": float64(0), + "n_users": 0, + "n_unique_users": 0, + "n_cpus": 0, + "n_physical_cpus": 0, + }, + time.Unix(0, 0), + telegraf.Gauge, + ), + metric.New( + "system", + map[string]string{}, + map[string]interface{}{"uptime": uint64(0)}, + time.Unix(0, 0), + telegraf.Counter, + ), + metric.New( + "system", + map[string]string{}, + map[string]interface{}{"uptime_format": string("")}, + time.Unix(0, 0), + telegraf.Untyped, + ), + }, + }, + { + name: "cpus", + include: []string{"cpus"}, + expected: []telegraf.Metric{ + metric.New( + "system", + map[string]string{}, + map[string]interface{}{ + "n_virtual_cpus": 0, + "n_physical_cpus": 0, + }, + time.Unix(0, 0), + telegraf.Gauge, + ), + }, + }, + { + name: "uptime as gauge field", + include: []string{"uptime"}, + expected: []telegraf.Metric{ + metric.New( + "system", + map[string]string{}, + map[string]interface{}{"uptime": uint64(0)}, + time.Unix(0, 0), + telegraf.Gauge, + ), + }, + }, + { + name: "all new options", + include: []string{"load", "users", "cpus", "uptime"}, + requireUsers: true, + expected: []telegraf.Metric{ + metric.New( + "system", + map[string]string{}, + map[string]interface{}{ + "load1": float64(0), + "load5": float64(0), + "load15": float64(0), + "n_users": 0, + "n_unique_users": 0, + "n_virtual_cpus": 0, + "n_physical_cpus": 0, + "uptime": uint64(0), + }, + time.Unix(0, 0), + telegraf.Gauge, + ), + }, + }, + { + name: "legacy_uptime only", + include: []string{"legacy_uptime"}, + expected: []telegraf.Metric{ + metric.New( + "system", + map[string]string{}, + map[string]interface{}{"uptime": uint64(0)}, + time.Unix(0, 0), + telegraf.Counter, + ), + metric.New( + "system", + map[string]string{}, + map[string]interface{}{"uptime_format": string("")}, + time.Unix(0, 0), + telegraf.Untyped, + ), + }, + }, + { + name: "users only", + include: []string{"users"}, + requireUsers: true, + expected: []telegraf.Metric{ + metric.New( + "system", + map[string]string{}, + map[string]interface{}{ + "n_users": 0, + "n_unique_users": 0, + }, + time.Unix(0, 0), + telegraf.Gauge, + ), + }, + }, + { + name: "duplicates are de-duplicated", + include: []string{"legacy_uptime", "legacy_uptime", "cpus", "cpus"}, + expected: []telegraf.Metric{ + metric.New( + "system", + map[string]string{}, + map[string]interface{}{ + "n_virtual_cpus": 0, + "n_physical_cpus": 0, + }, + time.Unix(0, 0), + telegraf.Gauge, + ), + metric.New( + "system", + map[string]string{}, + map[string]interface{}{"uptime": uint64(0)}, + time.Unix(0, 0), + telegraf.Counter, + ), + metric.New( + "system", + map[string]string{}, + map[string]interface{}{"uptime_format": string("")}, + time.Unix(0, 0), + telegraf.Untyped, + ), + }, + }, + } + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + if tt.requireUsers && !usersAvailable { + t.Skip("host.Users() not mockable on this platform") + } + s := &System{ + Include: tt.include, + Log: &testutil.Logger{}, + } + require.NoError(t, s.Init()) + + var acc testutil.Accumulator + require.NoError(t, s.Gather(&acc)) + + actual := acc.GetTelegrafMetrics() + testutil.RequireMetricsStructureEqual(t, tt.expected, actual, testutil.IgnoreTime(), testutil.SortMetrics()) + }) + } +} diff --git a/plugins/inputs/system/system_users_linux_test.go b/plugins/inputs/system/system_users_linux_test.go new file mode 100644 index 0000000000000..315a30e3d8b7c --- /dev/null +++ b/plugins/inputs/system/system_users_linux_test.go @@ -0,0 +1,24 @@ +//go:build linux + +package system + +import ( + "os" + "path/filepath" + "testing" + + "github.com/stretchr/testify/require" +) + +// setupUsers configures gopsutil to read from an empty synthetic utmp file +// so that host.Users() returns zero users deterministically. Returns true +// to indicate the call is mocked and always available. +func setupUsers(t *testing.T) bool { + t.Helper() + tmpDir := t.TempDir() + runDir := filepath.Join(tmpDir, "run") + require.NoError(t, os.MkdirAll(runDir, 0750)) + require.NoError(t, os.WriteFile(filepath.Join(runDir, "utmp"), nil, 0640)) + t.Setenv("HOST_VAR", tmpDir) + return true +} diff --git a/plugins/inputs/system/system_users_other_test.go b/plugins/inputs/system/system_users_other_test.go new file mode 100644 index 0000000000000..57b963c7e65df --- /dev/null +++ b/plugins/inputs/system/system_users_other_test.go @@ -0,0 +1,18 @@ +//go:build !linux + +package system + +import ( + "testing" + + "github.com/shirou/gopsutil/v4/host" +) + +// setupUsers cannot mock host.Users() on non-Linux platforms because gopsutil +// hardcodes the utmp path or returns ErrNotImplementedError. It probes the +// call at runtime and returns true only if users can actually be read. +func setupUsers(t *testing.T) bool { + t.Helper() + _, err := host.Users() + return err == nil +}