Skip to content
Merged
49 changes: 46 additions & 3 deletions plugins/inputs/system/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,14 @@ plugin ordering. See [CONFIGURATION.md][CONFIGURATION.md] for more details.
## legacy_cpus - legacy layout of CPU counts; see README for details
## uptime - system uptime
## legacy_uptime - legacy layout of system uptime; see README for details
## os - operating system release and uname information
# include = ["load", "users", "legacy_cpus", "legacy_uptime"]

## How long to cache the result of the "os" group between gathers.
## Set higher to reduce the number of os-release/uname reads, lower to
## surface distro upgrades and kexec'd kernels faster. Set to zero to
## re-read the data on every gather.
# os_cache_ttl = "8h"
```

> [!NOTE]
Expand All @@ -52,12 +59,23 @@ The `n_unique_users` shows the count of unique usernames logged in. This way if
a user has multiple sessions open/started they would only get counted once. The
same requirements for `n_users` apply.

The `os` group reads `/etc/os-release` on Linux (typically world-readable) and
calls the `uname` syscall on POSIX systems. The `os` field is always populated
from Go's runtime, and `arch` falls back to the runtime architecture when the
kernel cannot be queried, so both are always present. On platforms where
gopsutil cannot provide platform release or kernel data (e.g. parts of
FreeBSD/OpenBSD/Solaris) the `platform`, `platform_family`, `platform_version`
and `kernel_version` fields may be empty. Results are cached between gathers;
see the `os_cache_ttl` option above.

## Metrics

### `system`
The `include` option controls which measurements and fields are gathered.
The `load`, `users`, `cpus` / `legacy_cpus` and `uptime` / `legacy_uptime`
groups populate the `system` measurement, while the `os` group emits a
separate `system_os` measurement.

All fields below belong to the `system` measurement. The `include` option
controls which groups are gathered.
### `system`

| Field | Include option | Type | Description |
|-------------------|----------------------------|---------|---------------------------------------------|
Expand All @@ -73,6 +91,23 @@ controls which groups are gathered.
| `uptime` | `legacy_uptime` | integer | System uptime in seconds (separate counter) |
| `uptime_format` | `legacy_uptime` | string | Human-readable uptime (deprecated) |

### `system_os`

Emitted only when `os` is included. The values reflect operating system
release information together with `uname`-style kernel data. Fields are
reported as strings. The `os` and `arch` fields are always populated; the
`platform`, `platform_family`, `platform_version` and `kernel_version` fields
may be empty on platforms where gopsutil cannot determine them.

| Field | Type | Description |
|--------------------|--------|----------------------------------------------------------------------|
| `os` | string | Operating system family as reported by Go's runtime (e.g. `linux`) |
| `arch` | string | Architecture as returned by `uname -m` (e.g. `x86_64`); falls back to Go's runtime architecture when the kernel cannot be queried |
| `platform` | string | OS distribution / platform identifier (e.g. `ubuntu`, `centos`) |
| `platform_family` | string | Platform family (e.g. `debian`, `rhel`) |
| `platform_version` | string | Platform / distribution version (e.g. `26.04`) |
| `kernel_version` | string | Kernel release as returned by `uname -r` (e.g. `7.0.0-7-generic`) |

## Example Output

### Default configuration
Expand All @@ -94,3 +129,11 @@ in a single metric with the new field names:
```text
system,host=worker-01 load1=3.72,load5=2.4,load15=2.1,n_users=3i,n_unique_users=2i,n_virtual_cpus=4i,n_physical_cpus=2i,uptime=1249632i 1748000000000000000
```

### OS information

With `include = ["os"]`, a separate `system_os` measurement is emitted:

```text
system_os,host=worker-01 os="linux",arch="x86_64",platform="ubuntu",platform_family="debian",platform_version="26.04",kernel_version="7.0.0-7-generic" 1748000000000000000
```
7 changes: 7 additions & 0 deletions plugins/inputs/system/sample.conf
Original file line number Diff line number Diff line change
Expand Up @@ -7,4 +7,11 @@
## legacy_cpus - legacy layout of CPU counts; see README for details
## uptime - system uptime
## legacy_uptime - legacy layout of system uptime; see README for details
## os - operating system release and uname information
# include = ["load", "users", "legacy_cpus", "legacy_uptime"]

## How long to cache the result of the "os" group between gathers.
## Set higher to reduce the number of os-release/uname reads, lower to
## surface distro upgrades and kexec'd kernels faster. Set to zero to
## re-read the data on every gather.
# os_cache_ttl = "8h"
58 changes: 54 additions & 4 deletions plugins/inputs/system/system.go
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@ import (
"errors"
"fmt"
"os"
"runtime"
"strings"
"time"

Expand All @@ -24,8 +25,12 @@ import (
var sampleConfig string

type System struct {
Include []string `toml:"include"`
Log telegraf.Logger `toml:"-"`
Include []string `toml:"include"`
OSCacheTTL config.Duration `toml:"os_cache_ttl"`
Log telegraf.Logger `toml:"-"`

osCache map[string]interface{}
osCachedAt time.Time
}

func (*System) SampleConfig() string {
Expand All @@ -46,7 +51,7 @@ func (s *System) Init() error {
continue
}
switch incl {
case "load", "users", "cpus", "uptime":
case "load", "users", "cpus", "uptime", "os":
case "legacy_cpus":
if userSupplied {
config.PrintOptionValueDeprecationNotice(
Expand Down Expand Up @@ -97,6 +102,19 @@ func (s *System) Gather(acc telegraf.Accumulator) error {

for _, incl := range s.Include {
switch incl {
case "os":
if time.Since(s.osCachedAt) > time.Duration(s.OSCacheTTL) {
osCache, err := gatherOS()
if err != nil {
acc.AddError(err)
} else {
s.osCache = osCache
s.osCachedAt = now
}
}
if len(s.osCache) > 0 {
acc.AddFields("system_os", s.osCache, nil, now)
}
case "load":
loadavg, err := load.Avg()
if err != nil {
Expand Down Expand Up @@ -166,6 +184,36 @@ func (s *System) Gather(acc telegraf.Accumulator) error {
return nil
}

// gatherOS collects the fields of the "os" group from gopsutil's individual
// host probes. host.Info() is deliberately not used because it also runs
// virtualization, boot-time and process-count probes that this group does
// not need.
//
// A probe error whose message contains "not implemented" is tolerated and
// leaves the corresponding fields at whatever the probe returned; any other
// probe error aborts the collection and is returned wrapped. The "arch" field
// falls back to runtime.GOARCH when the probe yields an empty string.
func gatherOS() (map[string]interface{}, error) {
	// fatal reports whether a probe error should abort the collection.
	fatal := func(probeErr error) bool {
		return probeErr != nil && !strings.Contains(probeErr.Error(), "not implemented")
	}

	platform, family, version, err := host.PlatformInformation()
	if fatal(err) {
		return nil, fmt.Errorf("reading platform information: %w", err)
	}

	kernelVersion, err := host.KernelVersion()
	if fatal(err) {
		return nil, fmt.Errorf("reading kernel version: %w", err)
	}

	arch, err := host.KernelArch()
	if fatal(err) {
		return nil, fmt.Errorf("reading kernel architecture: %w", err)
	}
	if arch == "" {
		arch = runtime.GOARCH
	}

	fields := map[string]interface{}{
		"os":               runtime.GOOS,
		"arch":             arch,
		"platform":         platform,
		"platform_family":  family,
		"platform_version": version,
		"kernel_version":   kernelVersion,
	}
	return fields, nil
}

func findUniqueUsers(userStats []host.UserStat) int {
uniqueUsers := make(map[string]bool)
for _, userstat := range userStats {
Expand Down Expand Up @@ -201,6 +249,8 @@ func formatUptime(uptime uint64) string {

// init registers the "system" input plugin with its default settings.
func init() {
	inputs.Add("system", func() telegraf.Input {
		// Default cache lifetime for the "os" group, matching the
		// os_cache_ttl value shown in the sample configuration.
		return &System{
			OSCacheTTL: config.Duration(8 * time.Hour),
		}
	})
}
55 changes: 53 additions & 2 deletions plugins/inputs/system/system_test.go
Original file line number Diff line number Diff line change
@@ -1,13 +1,16 @@
package system

import (
"path/filepath"
"runtime"
"testing"
"time"

"github.com/shirou/gopsutil/v4/host"
"github.com/stretchr/testify/require"

"github.com/influxdata/telegraf"
"github.com/influxdata/telegraf/config"
"github.com/influxdata/telegraf/metric"
"github.com/influxdata/telegraf/testutil"
)
Expand Down Expand Up @@ -75,8 +78,8 @@ func TestInitAllValidOptions(t *testing.T) {
name string
include []string
}{
{"new", []string{"load", "users", "cpus", "uptime"}},
{"legacy", []string{"load", "users", "legacy_cpus", "legacy_uptime"}},
{"new", []string{"load", "users", "cpus", "uptime", "os"}},
{"legacy", []string{"load", "users", "legacy_cpus", "legacy_uptime", "os"}},
}
for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) {
Expand Down Expand Up @@ -306,3 +309,51 @@ func TestGather(t *testing.T) {
})
}
}

// TestGatherOSValues gathers the "os" group on Linux with HOST_ETC pointing
// at the os-release fixture under testdata and verifies the emitted
// "system_os" metric.
func TestGatherOSValues(t *testing.T) {
	if runtime.GOOS != "linux" {
		t.Skip("Skipping test on non-Linux setups...")
	}

	fixtureDir, err := filepath.Abs(filepath.Join("testdata", "os-release"))
	require.NoError(t, err)
	t.Setenv("HOST_ETC", fixtureDir)

	plugin := &System{
		Include:    []string{"os"},
		OSCacheTTL: config.Duration(8 * time.Hour),
		Log:        &testutil.Logger{},
	}
	require.NoError(t, plugin.Init())

	var acc testutil.Accumulator
	require.NoError(t, plugin.Gather(&acc))

	// Only the fixture-derived fields are pinned here; arch and
	// kernel_version come from uname(2) and depend on the host.
	wantFields := map[string]interface{}{
		"os":               "linux",
		"platform":         "telegraftest",
		"platform_family":  "",
		"platform_version": "1.0",
	}
	expected := []telegraf.Metric{
		metric.New(
			"system_os",
			map[string]string{},
			wantFields,
			time.Unix(0, 0),
			telegraf.Untyped,
		),
	}

	actual := acc.GetTelegrafMetrics()
	testutil.RequireMetricsEqual(t, expected, actual,
		testutil.IgnoreTime(), testutil.IgnoreFields("arch", "kernel_version"))

	// The host-dependent fields must still be present and non-empty.
	require.Len(t, actual, 1)
	for _, name := range []string{"arch", "kernel_version"} {
		value, ok := actual[0].GetField(name)
		require.True(t, ok)
		require.NotEmpty(t, value)
	}
}
4 changes: 4 additions & 0 deletions plugins/inputs/system/testdata/os-release/os-release
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
NAME="Telegraf Test OS"
ID=telegraftest
VERSION_ID="1.0"
PRETTY_NAME="Telegraf Test OS 1.0"
Loading