Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
70 changes: 55 additions & 15 deletions plugins/inputs/system/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -20,9 +20,28 @@ plugin ordering. See [CONFIGURATION.md][CONFIGURATION.md] for more details.
```toml @sample.conf
# Read metrics about system load & uptime
[[inputs.system]]
# no configuration
## Information to collect; available options are:
## load - 1, 5 and 15-minute load averages
## users - logged-in user counts
## cpus - CPU counts of the system
## legacy_cpus - legacy layout of CPU counts; see README for details
## uptime - system uptime
## legacy_uptime - legacy layout of system uptime; see README for details
# include = ["load", "users", "legacy_cpus", "legacy_uptime"]
```

> [!NOTE]
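> The `cpus` and `legacy_cpus` options are mutually exclusive, as are `uptime`
> and `legacy_uptime`. Listing both members of a pair in `include` is rejected
> with an error when the plugin is initialized.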

<!-- markdownlint-disable-next-line MD028 -->

> [!IMPORTANT]
> Switching from `legacy_uptime` to `uptime` changes the Prometheus metric
> type of `system_uptime` from **counter** to **gauge**. If your dashboards
> or alerts use `rate()` or `increase()` on `system_uptime`, update them
> before migrating.

### Permissions

The `n_users` field requires read access to `/var/run/utmp`, and may require the
Expand All @@ -35,22 +54,43 @@ same requirements for `n_users` apply.

## Metrics

- system
- fields:
- load1 (float)
- load15 (float)
- load5 (float)
- n_users (integer)
- n_unique_users (integer)
- n_cpus (integer)
- n_physical_cpus (integer)
- uptime (integer, seconds)
- uptime_format (string, deprecated in 1.10, use `uptime` field)
### `system`

All fields below belong to the `system` measurement. The `include` option
controls which groups are gathered.

| Field | Include option | Type | Description |
|-------------------|----------------------------|---------|---------------------------------------------|
| `load1` | `load` | float | 1-minute load average |
| `load5` | `load` | float | 5-minute load average |
| `load15` | `load` | float | 15-minute load average |
| `n_users` | `users` | integer | Number of logged-in user sessions |
| `n_unique_users` | `users` | integer | Number of unique logged-in usernames |
| `n_virtual_cpus` | `cpus` | integer | Number of logical CPUs |
| `n_cpus` | `legacy_cpus` | integer | Number of logical CPUs (legacy name) |
| `n_physical_cpus` | `cpus` / `legacy_cpus` | integer | Number of physical CPUs |
| `uptime` | `uptime` | integer | System uptime in seconds (gauge field) |
| `uptime` | `legacy_uptime` | integer | System uptime in seconds (separate counter) |
| `uptime_format` | `legacy_uptime` | string | Human-readable uptime (deprecated) |

## Example Output

### Default configuration

With the default `include = ["load", "users", "legacy_cpus", "legacy_uptime"]`,
the output is backward-compatible with previous versions; `uptime` and
`uptime_format` are each emitted as a separate metric:

```text
system,host=worker-01 load1=3.72,load5=2.4,load15=2.1,n_users=3i,n_unique_users=2i,n_cpus=4i,n_physical_cpus=2i 1748000000000000000
system,host=worker-01 uptime=1249632i 1748000000000000000
system,host=worker-01 uptime_format="14 days, 11:07" 1748000000000000000
```

### Recommended configuration

With `include = ["load", "users", "cpus", "uptime"]`, all fields, including
`uptime`, are emitted in a single metric, and the logical CPU count is reported
as `n_virtual_cpus` instead of `n_cpus`:

```text
system,host=tyrion load1=3.72,load5=2.4,load15=2.1,n_users=3i,n_cpus=4i,n_physical_cpus=2i 1483964144000000000
system,host=tyrion uptime=1249632i 1483964144000000000
system,host=tyrion uptime_format="14 days, 11:07" 1483964144000000000
system,host=worker-01 load1=3.72,load5=2.4,load15=2.1,n_users=3i,n_unique_users=2i,n_virtual_cpus=4i,n_physical_cpus=2i,uptime=1249632i 1748000000000000000
```
9 changes: 8 additions & 1 deletion plugins/inputs/system/sample.conf
Original file line number Diff line number Diff line change
@@ -1,3 +1,10 @@
# Read metrics about system load & uptime
[[inputs.system]]
# no configuration
## Information to collect; available options are:
## load - 1, 5 and 15-minute load averages
## users - logged-in user counts
## cpus - CPU counts of the system
## legacy_cpus - legacy layout of CPU counts; see README for details
## uptime - system uptime
## legacy_uptime - legacy layout of system uptime; see README for details
# include = ["load", "users", "legacy_cpus", "legacy_uptime"]
163 changes: 123 additions & 40 deletions plugins/inputs/system/system.go
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@ import (
"bufio"
"bytes"
_ "embed"
"errors"
"fmt"
"os"
"strings"
Expand All @@ -15,68 +16,152 @@ import (
"github.com/shirou/gopsutil/v4/load"

"github.com/influxdata/telegraf"
"github.com/influxdata/telegraf/config"
"github.com/influxdata/telegraf/plugins/inputs"
)

//go:embed sample.conf
var sampleConfig string

type System struct {
Log telegraf.Logger `toml:"-"`
Include []string `toml:"include"`
Log telegraf.Logger `toml:"-"`
}

// SampleConfig returns the plugin's sample configuration, i.e. the contents
// of sample.conf embedded into the sampleConfig variable at build time.
func (*System) SampleConfig() string {
return sampleConfig
}

func (s *System) Gather(acc telegraf.Accumulator) error {
loadavg, err := load.Avg()
if err != nil && !strings.Contains(err.Error(), "not implemented") {
return err
func (s *System) Init() error {
// Suppress deprecation warnings for default-only configs.
userSupplied := len(s.Include) > 0
if !userSupplied {
s.Include = []string{"load", "users", "legacy_cpus", "legacy_uptime"}
}

numLogicalCPUs, err := cpu.Counts(true)
if err != nil {
return err
enabled := make(map[string]bool, len(s.Include))
deduped := make([]string, 0, len(s.Include))
for _, incl := range s.Include {
if enabled[incl] {
continue
}
switch incl {
case "load", "users", "cpus", "uptime":
case "legacy_cpus":
if userSupplied {
config.PrintOptionValueDeprecationNotice(
"inputs.system",
"include",
"legacy_cpus",
telegraf.DeprecationInfo{
Since: "1.39.0",
RemovalIn: "1.45.0",
Notice: "use 'cpus' instead",
},
)
}
case "legacy_uptime":
if userSupplied {
config.PrintOptionValueDeprecationNotice(
"inputs.system",
"include",
"legacy_uptime",
telegraf.DeprecationInfo{
Since: "1.39.0",
RemovalIn: "1.45.0",
Notice: "use 'uptime' instead",
},
)
}
default:
return fmt.Errorf("invalid 'include' option %q", incl)
}
enabled[incl] = true
deduped = append(deduped, incl)
}
s.Include = deduped

numPhysicalCPUs, err := cpu.Counts(false)
if err != nil {
return err
if enabled["cpus"] && enabled["legacy_cpus"] {
return errors.New(`"cpus" and "legacy_cpus" are mutually exclusive`)
}

fields := map[string]interface{}{
"load1": loadavg.Load1,
"load5": loadavg.Load5,
"load15": loadavg.Load15,
"n_cpus": numLogicalCPUs,
"n_physical_cpus": numPhysicalCPUs,
if enabled["uptime"] && enabled["legacy_uptime"] {
return errors.New(`"uptime" and "legacy_uptime" are mutually exclusive`)
}

users, err := host.Users()
if err == nil {
fields["n_users"] = len(users)
fields["n_unique_users"] = findUniqueUsers(users)
} else if os.IsNotExist(err) {
s.Log.Debugf("Reading users: %s", err.Error())
} else if os.IsPermission(err) {
s.Log.Debug(err.Error())
}
return nil
}

func (s *System) Gather(acc telegraf.Accumulator) error {
now := time.Now()
acc.AddGauge("system", fields, nil, now)

uptime, err := host.Uptime()
if err != nil {
return err
fields := make(map[string]interface{}, 8)

for _, incl := range s.Include {
switch incl {
case "load":
loadavg, err := load.Avg()
if err != nil {
if !strings.Contains(err.Error(), "not implemented") {
acc.AddError(fmt.Errorf("reading load averages: %w", err))
}
continue
}
fields["load1"] = loadavg.Load1
fields["load5"] = loadavg.Load5
fields["load15"] = loadavg.Load15
case "users":
users, err := host.Users()
if err == nil {
fields["n_users"] = len(users)
fields["n_unique_users"] = findUniqueUsers(users)
} else if os.IsNotExist(err) {
s.Log.Debugf("Reading users: %s", err.Error())
} else if os.IsPermission(err) {
s.Log.Debug(err.Error())
} else {
s.Log.Warnf("Reading users: %s", err.Error())
}
case "cpus", "legacy_cpus":
numLogicalCPUs, err := cpu.Counts(true)
if err != nil {
acc.AddError(fmt.Errorf("reading logical CPU count: %w", err))
continue
}
numPhysicalCPUs, err := cpu.Counts(false)
if err != nil {
acc.AddError(fmt.Errorf("reading physical CPU count: %w", err))
continue
}
if incl == "cpus" {
fields["n_virtual_cpus"] = numLogicalCPUs
} else {
fields["n_cpus"] = numLogicalCPUs
}
fields["n_physical_cpus"] = numPhysicalCPUs
case "uptime":
uptime, err := host.Uptime()
if err != nil {
acc.AddError(fmt.Errorf("reading uptime: %w", err))
continue
}
fields["uptime"] = uptime
case "legacy_uptime":
uptime, err := host.Uptime()
if err != nil {
acc.AddError(fmt.Errorf("reading uptime: %w", err))
continue
}
acc.AddCounter("system", map[string]interface{}{
"uptime": uptime,
}, nil, now)
acc.AddFields("system", map[string]interface{}{
"uptime_format": formatUptime(uptime),
}, nil, now)
}
}

acc.AddCounter("system", map[string]interface{}{
"uptime": uptime,
}, nil, now)
acc.AddFields("system", map[string]interface{}{
"uptime_format": formatUptime(uptime),
}, nil, now)
if len(fields) > 0 {
acc.AddGauge("system", fields, nil, now)
}

return nil
}
Expand All @@ -88,7 +173,6 @@ func findUniqueUsers(userStats []host.UserStat) int {
uniqueUsers[userstat.User] = true
}
}

return len(uniqueUsers)
}

Expand All @@ -97,7 +181,6 @@ func formatUptime(uptime uint64) string {
w := bufio.NewWriter(buf)

days := uptime / (60 * 60 * 24)

if days != 0 {
s := ""
if days > 1 {
Expand Down
Loading
Loading