Skip to content
48 changes: 45 additions & 3 deletions plugins/inputs/system/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,16 @@ plugin ordering. See [CONFIGURATION.md][CONFIGURATION.md] for more details.
## legacy_cpus - legacy layout of CPU counts; see README for details
## uptime - system uptime
## legacy_uptime - legacy layout of system uptime; see README for details
## os - operating system release and uname information
# include = ["load", "users", "legacy_cpus", "legacy_uptime"]

## How long to cache the result of the "os" group between gathers.
## Set higher to reduce the number of os-release/uname reads, lower to
## surface distro upgrades and kexec'd kernels faster. A value of zero
## ("0s") caches the values until telegraf restarts; only safe on hosts
## that are not re-imaged or kexec'd at runtime. To re-read on every
## gather, set to a very small positive value such as "1ns".
# os_cache_ttl = "5m"
```

> [!NOTE]
Expand All @@ -52,12 +61,21 @@ The `n_unique_users` shows the count of unique usernames logged in. This way if
a user has multiple sessions open/started they would only get counted once. The
same requirements for `n_users` apply.

The `os` group reads `/etc/os-release` on Linux (typically world-readable) and
calls the `uname` syscall on POSIX systems. On platforms where gopsutil cannot
provide a particular value (e.g. parts of FreeBSD/OpenBSD/Solaris) the
corresponding field is left empty; if no field can be gathered, the
`system_os` metric is skipped entirely. Results are cached between gathers;
Comment thread
bilkoua marked this conversation as resolved.
Outdated
see `os_cache_ttl` above.

## Metrics

### `system`
The `include` option controls which measurements and fields are gathered.
The `load`, `users`, `cpus` / `legacy_cpus` and `uptime` / `legacy_uptime`
groups populate the `system` measurement, while the `os` group emits a
separate `system_os` measurement.

All fields below belong to the `system` measurement. The `include` option
controls which groups are gathered.
### `system`

| Field | Include option | Type | Description |
|-------------------|----------------------------|---------|---------------------------------------------|
Expand All @@ -73,6 +91,22 @@ controls which groups are gathered.
| `uptime` | `legacy_uptime` | integer | System uptime in seconds (separate counter) |
| `uptime_format` | `legacy_uptime` | string | Human-readable uptime (deprecated) |

### `system_os`

Emitted only when `os` is included. The values reflect operating system
release information together with `uname`-style kernel data. Fields are
reported as strings; on platforms where a particular value cannot be
determined the corresponding field is empty.

| Field | Type | Description |
|--------------------|--------|----------------------------------------------------------------------|
| `os` | string | Operating system family as reported by Go's runtime (e.g. `linux`) |
| `platform` | string | OS distribution / platform identifier (e.g. `ubuntu`, `centos`) |
| `platform_family` | string | Platform family (e.g. `debian`, `rhel`) |
| `platform_version` | string | Platform / distribution version (e.g. `26.04`) |
| `kernel_version` | string | Kernel release as returned by `uname -r` (e.g. `7.0.0-7-generic`) |
| `kernel_arch` | string | Kernel architecture as returned by `uname -m` (e.g. `x86_64`) |
Comment thread
bilkoua marked this conversation as resolved.
Outdated

## Example Output

### Default configuration
Expand All @@ -94,3 +128,11 @@ in a single metric with the new field names:
```text
system,host=worker-01 load1=3.72,load5=2.4,load15=2.1,n_users=3i,n_unique_users=2i,n_virtual_cpus=4i,n_physical_cpus=2i,uptime=1249632i 1748000000000000000
```

### OS information

With `include = ["os"]`, a separate `system_os` measurement is emitted:

```text
system_os,host=worker-01 os="linux",platform="ubuntu",platform_family="debian",platform_version="26.04",kernel_version="7.0.0-7-generic",kernel_arch="x86_64" 1748000000000000000
```
9 changes: 9 additions & 0 deletions plugins/inputs/system/sample.conf
Original file line number Diff line number Diff line change
Expand Up @@ -7,4 +7,13 @@
## legacy_cpus - legacy layout of CPU counts; see README for details
## uptime - system uptime
## legacy_uptime - legacy layout of system uptime; see README for details
## os - operating system release and uname information
# include = ["load", "users", "legacy_cpus", "legacy_uptime"]

## How long to cache the result of the "os" group between gathers.
## Set higher to reduce the number of os-release/uname reads, lower to
## surface distro upgrades and kexec'd kernels faster. A value of zero
## ("0s") caches the values until telegraf restarts; only safe on hosts
## that are not re-imaged or kexec'd at runtime. To re-read on every
## gather, set to a very small positive value such as "1ns".
# os_cache_ttl = "5m"
63 changes: 59 additions & 4 deletions plugins/inputs/system/system.go
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@ import (
"errors"
"fmt"
"os"
"runtime"
"strings"
"time"

Expand All @@ -23,9 +24,15 @@ import (
//go:embed sample.conf
var sampleConfig string

// defaultOSCacheTTL is the OSCacheTTL value the plugin factory in init starts
// with.
const defaultOSCacheTTL = 5 * time.Minute

// System holds the options of the "system" input together with the cached
// result of the "os" group.
type System struct {
Include []string `toml:"include"`
Log telegraf.Logger `toml:"-"`
Include []string `toml:"include"`
// OSCacheTTL is how long gathered os fields are reused. Gather only treats
// the cache as expired when this value is greater than zero.
OSCacheTTL config.Duration `toml:"os_cache_ttl"`
Log telegraf.Logger `toml:"-"`

// osFields holds the fields returned by the most recent gatherOS call.
osFields map[string]interface{}
Comment thread
bilkoua marked this conversation as resolved.
Outdated
// osCachedAt is the gather time at which osFields was last refreshed; the
// zero value means nothing has been gathered yet.
osCachedAt time.Time
}

func (*System) SampleConfig() string {
Expand All @@ -46,7 +53,7 @@ func (s *System) Init() error {
continue
}
switch incl {
case "load", "users", "cpus", "uptime":
case "load", "users", "cpus", "uptime", "os":
case "legacy_cpus":
if userSupplied {
config.PrintOptionValueDeprecationNotice(
Expand Down Expand Up @@ -97,6 +104,20 @@ func (s *System) Gather(acc telegraf.Accumulator) error {

for _, incl := range s.Include {
switch incl {
case "os":
ttl := time.Duration(s.OSCacheTTL)
expired := ttl > 0 && now.Sub(s.osCachedAt) >= ttl
if s.osCachedAt.IsZero() || expired {
Comment thread
bilkoua marked this conversation as resolved.
Outdated
osFields, err := gatherOS()
if err != nil {
acc.AddError(err)
}
s.osFields = osFields
s.osCachedAt = now
Comment thread
bilkoua marked this conversation as resolved.
Outdated
}
if len(s.osFields) > 0 {
acc.AddFields("system_os", s.osFields, nil, now)
}
case "load":
loadavg, err := load.Avg()
if err != nil {
Expand Down Expand Up @@ -166,6 +187,38 @@ func (s *System) Gather(acc telegraf.Accumulator) error {
return nil
}

// gatherOS reads OS release and uname information via gopsutil, skipping
// host.Info() to avoid the unrelated virtualization, boot-time and
// process-count probes.
func gatherOS() (map[string]interface{}, error) {
var errs []error

platform, family, version, err := host.PlatformInformation()
if err != nil && !strings.Contains(err.Error(), "not implemented") {
errs = append(errs, fmt.Errorf("reading platform information: %w", err))
}
kernelVersion, err := host.KernelVersion()
if err != nil && !strings.Contains(err.Error(), "not implemented") {
errs = append(errs, fmt.Errorf("reading kernel version: %w", err))
}
kernelArch, err := host.KernelArch()
if err != nil && !strings.Contains(err.Error(), "not implemented") {
errs = append(errs, fmt.Errorf("reading kernel architecture: %w", err))
}

if platform == "" && family == "" && version == "" && kernelVersion == "" && kernelArch == "" {
return nil, errors.Join(errs...)
}
return map[string]interface{}{
"os": runtime.GOOS,
"platform": platform,
"platform_family": family,
"platform_version": version,
"kernel_version": kernelVersion,
"kernel_arch": kernelArch,
}, errors.Join(errs...)
Comment thread
bilkoua marked this conversation as resolved.
Outdated
}

func findUniqueUsers(userStats []host.UserStat) int {
uniqueUsers := make(map[string]bool)
for _, userstat := range userStats {
Expand Down Expand Up @@ -201,6 +254,8 @@ func formatUptime(uptime uint64) string {

// init registers the "system" input; the factory below seeds OSCacheTTL with
// defaultOSCacheTTL.
func init() {
inputs.Add("system", func() telegraf.Input {
return &System{}
return &System{
OSCacheTTL: config.Duration(defaultOSCacheTTL),
Comment thread
bilkoua marked this conversation as resolved.
Outdated
}
})
}
163 changes: 163 additions & 0 deletions plugins/inputs/system/system_os_linux_test.go
Comment thread
bilkoua marked this conversation as resolved.
Outdated
Original file line number Diff line number Diff line change
@@ -0,0 +1,163 @@
//go:build linux

package system

import (
"os"
"path/filepath"
"testing"
"time"

"github.com/stretchr/testify/require"

"github.com/influxdata/telegraf/config"
"github.com/influxdata/telegraf/testutil"
)

// testOSRelease is the synthetic os-release content that setupOS installs;
// the Linux tests expect platform "telegraftest" and version "1.0" from it.
const testOSRelease = `NAME="Telegraf Test OS"
ID=telegraftest
VERSION_ID="1.0"
PRETTY_NAME="Telegraf Test OS 1.0"
`

// setupOS points gopsutil at a synthetic os-release file via HOST_ETC by
// writing testOSRelease through mockOSRelease.
// Kernel fields still come from the live uname syscall.
// In this Linux build it always returns true.
func setupOS(t testing.TB) bool {
t.Helper()
mockOSRelease(t, testOSRelease)
return true
}

func mockOSRelease(t testing.TB, content string) {
t.Helper()
etcDir := os.Getenv("HOST_ETC")
if etcDir == "" {
etcDir = filepath.Join(t.TempDir(), "etc")
require.NoError(t, os.MkdirAll(etcDir, 0750))
t.Setenv("HOST_ETC", etcDir)
}
writeOSRelease(t, etcDir, content)
}

// writeOSRelease writes content to the "os-release" file in etcDir with mode
// 0640 and fails the test if the write does not succeed.
func writeOSRelease(t testing.TB, etcDir, content string) {
	t.Helper()

	target := filepath.Join(etcDir, "os-release")
	err := os.WriteFile(target, []byte(content), 0640)
	require.NoError(t, err)
}
Comment thread
bilkoua marked this conversation as resolved.
Outdated

// newOSPlugin returns a System that gathers only the "os" group, caches it
// for ttl and logs through a test logger.
func newOSPlugin(ttl time.Duration) *System {
	plugin := &System{Log: &testutil.Logger{}}
	plugin.Include = []string{"os"}
	plugin.OSCacheTTL = config.Duration(ttl)
	return plugin
}
Comment thread
bilkoua marked this conversation as resolved.
Outdated

func TestGatherOSValuesLinux(t *testing.T) {
setupOS(t)

s := newOSPlugin(defaultOSCacheTTL)
require.NoError(t, s.Init())

var acc testutil.Accumulator
require.NoError(t, s.Gather(&acc))

m, found := acc.Get("system_os")
require.True(t, found, "system_os metric not produced")

require.Equal(t, "linux", m.Fields["os"])
require.Equal(t, "telegraftest", m.Fields["platform"])
require.Empty(t, m.Fields["platform_family"])
require.Equal(t, "1.0", m.Fields["platform_version"])
require.IsType(t, "", m.Fields["kernel_version"])
require.NotEmpty(t, m.Fields["kernel_version"])
require.IsType(t, "", m.Fields["kernel_arch"])
require.NotEmpty(t, m.Fields["kernel_arch"])
Comment thread
bilkoua marked this conversation as resolved.
Outdated
}

// TestGatherOSMissingOSReleaseLinux points HOST_ETC at an empty directory and
// expects a system_os metric with empty platform fields but populated kernel
// fields.
func TestGatherOSMissingOSReleaseLinux(t *testing.T) {
	// An empty temp directory: no os-release file is written into it.
	emptyEtc := t.TempDir()
	t.Setenv("HOST_ETC", emptyEtc)

	plugin := newOSPlugin(defaultOSCacheTTL)
	require.NoError(t, plugin.Init())

	var acc testutil.Accumulator
	require.NoError(t, plugin.Gather(&acc))

	metric, ok := acc.Get("system_os")
	require.True(t, ok, "uname syscall always populates kernel fields on Linux")

	for _, key := range []string{"platform", "platform_family", "platform_version"} {
		require.Empty(t, metric.Fields[key], key)
	}
	for _, key := range []string{"kernel_version", "kernel_arch"} {
		require.NotEmpty(t, metric.Fields[key], key)
	}
}

// TestGatherOSCacheLinux gathers once, rewrites os-release with an "upgraded"
// platform, optionally sleeps, gathers again and checks whether the second
// metric shows the cached or the refreshed platform for the given TTL.
func TestGatherOSCacheLinux(t *testing.T) {
	type cacheCase struct {
		name             string
		ttl              time.Duration
		sleep            time.Duration
		expectedPlatform string
	}

	cases := []cacheCase{
		{
			name:             "refresh after expiry",
			ttl:              time.Millisecond,
			sleep:            5 * time.Millisecond,
			expectedPlatform: "upgraded",
		},
		{
			name:             "forever with zero ttl",
			ttl:              0,
			sleep:            5 * time.Millisecond,
			expectedPlatform: "telegraftest",
		},
		{
			name:             "refresh on every gather with tiny ttl",
			ttl:              time.Nanosecond,
			expectedPlatform: "upgraded",
		},
		{
			name:             "served from cache within positive ttl",
			ttl:              defaultOSCacheTTL,
			expectedPlatform: "telegraftest",
		},
	}

	for _, tc := range cases {
		t.Run(tc.name, func(t *testing.T) {
			setupOS(t)

			plugin := newOSPlugin(tc.ttl)
			require.NoError(t, plugin.Init())

			// First gather fills the cache from the original os-release.
			var acc testutil.Accumulator
			require.NoError(t, plugin.Gather(&acc))

			// Change the file on disk, then give the TTL time to lapse.
			mockOSRelease(t, "ID=upgraded\nVERSION_ID=\"2.0\"\n")
			if tc.sleep > 0 {
				time.Sleep(tc.sleep)
			}

			acc.ClearMetrics()
			require.NoError(t, plugin.Gather(&acc))

			metric, ok := acc.Get("system_os")
			require.True(t, ok)
			require.Equal(t, tc.expectedPlatform, metric.Fields["platform"])
		})
	}
}

// BenchmarkGatherOS measures Gather for a plugin that includes only the "os"
// group and uses the default cache TTL.
func BenchmarkGatherOS(b *testing.B) {
	setupOS(b)

	plugin := newOSPlugin(defaultOSCacheTTL)
	require.NoError(b, plugin.Init())

	var acc testutil.Accumulator
	for b.Loop() {
		// Drop the previous iteration's metrics so the accumulator does not grow.
		acc.ClearMetrics()
		err := plugin.Gather(&acc)
		if err != nil {
			b.Fatal(err)
		}
	}
}
Comment thread
bilkoua marked this conversation as resolved.
Outdated
18 changes: 18 additions & 0 deletions plugins/inputs/system/system_os_other_test.go
Comment thread
bilkoua marked this conversation as resolved.
Outdated
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
//go:build !linux

package system

import (
"testing"

"github.com/shirou/gopsutil/v4/host"
)

// setupOS reports whether os-group data can be gathered on this non-Linux
// platform. The os-group calls cannot be mocked here because gopsutil reads
// from native APIs, so the helper probes host.KernelVersion at runtime and
// returns true only when that call succeeds.
func setupOS(t testing.TB) bool {
	t.Helper()
	if _, err := host.KernelVersion(); err != nil {
		return false
	}
	return true
}
Loading
Loading