Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
23 changes: 18 additions & 5 deletions deepspeed/monitor/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@


def get_monitor_config(param_dict):
    """Build the monitor configuration from a DeepSpeed config dictionary.

    Args:
        param_dict: Mapping of top-level DeepSpeed config sections. Only the
            monitor sections named below are read; a section that is absent
            falls back to an empty dict.

    Returns:
        A ``DeepSpeedMonitorConfig`` constructed from the selected sections.
    """
    monitor_keys = ("tensorboard", "wandb", "csv_monitor", "comet", "trackio")
    # Build the mapping once; every supported backend (including trackio) is
    # looked up with the same empty-dict fallback.
    monitor_dict = {key: param_dict.get(key, {}) for key in monitor_keys}
    return DeepSpeedMonitorConfig(**monitor_dict)


Expand All @@ -23,7 +23,7 @@ class TensorBoardConfig(DeepSpeedConfigModel):
output_path: str = ""
"""
Path to where the Tensorboard logs will be written. If not provided, the
output path is set under the training scripts launching path.
output path is set under the training script's launching path.
"""

job_name: str = "DeepSpeedJobName"
Expand Down Expand Up @@ -55,7 +55,7 @@ class CSVConfig(DeepSpeedConfigModel):
output_path: str = ""
"""
Path to where the csv files will be written. If not provided, the output
path is set under the training scripts launching path.
path is set under the training script's launching path.
"""

job_name: str = "DeepSpeedJobName"
Expand Down Expand Up @@ -122,6 +122,16 @@ class CometConfig(DeepSpeedConfigModel):
"""


class TrackioConfig(DeepSpeedConfigModel):
"""Sets parameters for Trackio monitor."""

enabled: bool = False
""" Whether logging to Trackio is enabled. Requires `trackio` package is installed. """

project: str = "deepspeed"
Comment on lines +125 to +131

Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

P1 Badge Add required Trackio tests and docs

This introduces a new monitoring backend but the diff only changes production code; the workspace AGENTS.md requires new features to include corresponding tests and documentation updates. Please add coverage for the Trackio config/MonitorMaster wiring and logging behavior, and update the monitor docs so users can discover the new trackio configuration.

Useful? React with 👍 / 👎.

""" Name for the Trackio project. """


class DeepSpeedMonitorConfig(DeepSpeedConfigModel):
"""Sets parameters for various monitoring methods."""

Expand All @@ -137,8 +147,11 @@ class DeepSpeedMonitorConfig(DeepSpeedConfigModel):
csv_monitor: CSVConfig = {}
""" Local CSV output of monitoring data. """

trackio: TrackioConfig = {}
""" Trackio monitor, requires `trackio` package is installed. """

@model_validator(mode="after")
def check_enabled(self):
    """Derive the aggregate ``enabled`` flag from the individual backends.

    The flag is true when at least one of the tensorboard, wandb,
    csv_monitor, comet or trackio sections has ``enabled`` set.

    Returns:
        The validated model instance (``self``).
    """
    backends = (self.tensorboard, self.wandb, self.csv_monitor, self.comet, self.trackio)
    # Stored directly in the instance __dict__, as the original code does
    # (presumably to bypass regular attribute assignment on the model --
    # confirm against DeepSpeedConfigModel).
    self.__dict__["enabled"] = any(backend.enabled for backend in backends)
    return self
8 changes: 7 additions & 1 deletion deepspeed/monitor/monitor.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
# SPDX-License-Identifier: Apache-2.0

# DeepSpeed Team

"""
Support different forms of monitoring such as wandb and tensorboard
"""
Expand All @@ -25,6 +26,7 @@ def write_events(self, event_list):
from .tensorboard import TensorBoardMonitor
from .csv_monitor import csvMonitor
from .comet import CometMonitor
from .trackio import TrackioMonitor


class MonitorMaster(Monitor):
Expand All @@ -35,8 +37,8 @@ def __init__(self, monitor_config):
self.wandb_monitor = None
self.csv_monitor = None
self.comet_monitor = None
self.trackio_monitor = None
self.enabled = monitor_config.enabled

if dist.get_rank() == 0:
if monitor_config.tensorboard.enabled:
self.tb_monitor = TensorBoardMonitor(monitor_config.tensorboard)
Expand All @@ -46,6 +48,8 @@ def __init__(self, monitor_config):
self.csv_monitor = csvMonitor(monitor_config.csv_monitor)
if monitor_config.comet.enabled:
self.comet_monitor = CometMonitor(monitor_config.comet)
if monitor_config.trackio.enabled:
self.trackio_monitor = TrackioMonitor(monitor_config.trackio)

def write_events(self, event_list):
if dist.get_rank() == 0:
Expand All @@ -57,3 +61,5 @@ def write_events(self, event_list):
self.csv_monitor.write_events(event_list)
if self.comet_monitor is not None:
self.comet_monitor.write_events(event_list)
if self.trackio_monitor is not None:
self.trackio_monitor.write_events(event_list)
33 changes: 33 additions & 0 deletions deepspeed/monitor/trackio.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
# Copyright (c) Microsoft Corporation.
# SPDX-License-Identifier: Apache-2.0

# DeepSpeed Team

from .utils import check_trackio_availability
from .monitor import Monitor
import deepspeed.comm as dist


class TrackioMonitor(Monitor):
    """Monitor backend that forwards DeepSpeed events to Trackio."""

    def __init__(self, trackio_config):
        """Check that Trackio is importable and start a run on rank 0.

        Args:
            trackio_config: Config object providing ``enabled`` and
                ``project`` attributes.
        """
        super().__init__(trackio_config)
        # Fails early (before any attribute is set) if trackio is missing.
        check_trackio_availability()
        import trackio

        self.enabled = trackio_config.enabled
        self.project = trackio_config.project

        if not self.enabled:
            return
        # Only rank 0 initializes the Trackio run.
        if dist.get_rank() == 0:
            trackio.init(project=self.project)

    def log(self, data, step=None):
        """Forward ``data`` to ``trackio.log`` when enabled and on rank 0.

        Args:
            data: Mapping of metric names to values.
            step: Optional step value passed through to ``trackio.log``.

        Returns:
            The result of ``trackio.log``, or ``None`` when logging is skipped.
        """
        if not self.enabled or dist.get_rank() != 0:
            return None
        import trackio

        return trackio.log(data, step=step)

    def write_events(self, event_list):
        """Log each event, read as ``(label, value, step)`` by position.

        Args:
            event_list: Iterable of indexable events; items 0, 1 and 2 are
                used as the metric label, its value and the step.
        """
        if not self.enabled or dist.get_rank() != 0:
            return
        for record in event_list:
            name, scalar, global_step = record[0], record[1], record[2]
            self.log({name: scalar}, step=global_step)
12 changes: 10 additions & 2 deletions deepspeed/monitor/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,8 +8,6 @@

def check_tb_availability():
try:
# torch.utils.tensorboard will fail if `tensorboard` is not available,
# see their docs for more details: https://pytorch.org/docs/1.8.0/tensorboard.html
import tensorboard # noqa: F401 # type: ignore
except ImportError:
print('If you want to use tensorboard logging, please `pip install tensorboard`')
Expand All @@ -35,3 +33,13 @@ def check_comet_availability():
except ImportError:
print('If you want to use comet logging, please `pip install "comet_ml>=3.41.0"`')
raise


def check_trackio_availability():
    """Verify that the optional ``trackio`` package can be imported.

    Raises:
        ImportError: If ``trackio`` cannot be imported. An installation hint
            is printed before the exception is re-raised.
    """
    install_hint = (
        'If you want to use Trackio logging, please `pip install trackio` and follow the instructions at https://github.com/huggingface/trackio'
    )
    try:
        import trackio  # noqa: F401 # type: ignore
    except ImportError:
        print(install_hint)
        raise
10 changes: 8 additions & 2 deletions docs/_tutorials/monitor.md
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@ In this tutorial, we introduce the DeepSpeed Monitor and provide examples of its

## Overview

Monitoring model and system metrics during training is vital to ensure hardware resources are fully utilized. The DeepSpeed Monitor enables live logging of metrics through one or more monitoring backends such as PyTorch's [TensorBoard](https://pytorch.org/docs/1.8.0/tensorboard.html), [WandB](https://docs.wandb.ai/quickstart), [Comet](https://www.comet.com/site/?utm_source=deepseed&utm_medium=docs&utm_content=tutorial) and simple CSV files.
Monitoring model and system metrics during training is vital to ensure hardware resources are fully utilized. The DeepSpeed Monitor enables live logging of metrics through one or more monitoring backends such as PyTorch's [TensorBoard](https://pytorch.org/docs/1.8.0/tensorboard.html), [WandB](https://docs.wandb.ai/quickstart), [Comet](https://www.comet.com/site/?utm_source=deepseed&utm_medium=docs&utm_content=tutorial), [Trackio](https://github.com/huggingface/trackio) and simple CSV files.

Below is a live monitoring view for TensorBoard:

Expand All @@ -25,6 +25,8 @@ Below is a live monitoring view for Comet:

![CometML Example Output](/assets/images/comet_monitor.png){: .align-center}

[Trackio](https://github.com/huggingface/trackio) is a lightweight, offline-first experiment tracking library from Hugging Face with a WandB-compatible API. Runs can be visualized as an HF Space or dataset on the HF Hub.

## Usage

The DeepSpeed Monitor is configured within the deepspeed [configuration file](/docs/config-json/#monitoring-module). DeepSpeed will automatically monitor key training metrics, including those tracked with the `wall_clock_breakdown` configuration option. In addition, users can log their own custom events and metrics.
Expand Down Expand Up @@ -54,6 +56,10 @@ When using DeepSpeed for model training, the Monitor can be configured in the De
"project": "my_project",
"experiment_name": "my_experiment"
},
"trackio": {
"enabled": true,
"project": "my_project"
},
"csv_monitor": {
"enabled": true,
"output_path": "output/ds_logs/",
Expand All @@ -69,7 +75,7 @@ DeepSpeed will automatically log to all available and enabled monitoring backend
In addition to automatic monitoring, users can log their own custom metrics in client scripts. Currently, there are two ways to initialize Monitor objects:

1. (Recommended) - Create a `MonitorMaster(ds_config.monitor_config)` object, which automatically initializes all monitor backends present in the DeepSpeed configuration
2. Create a specific `TensorBoardMonitor(ds_config.monitor_config)`, `WandbMonitor(ds_config.monitor_config)`, `csvMonitor(ds_config.monitor_config)` object which will only initialize a specific monitor backend present in the DeepSpeed configuration
2. Create a specific `TensorBoardMonitor(ds_config.monitor_config.tensorboard)`, `WandbMonitor(ds_config.monitor_config.wandb)`, `TrackioMonitor(ds_config.monitor_config.trackio)`, or `csvMonitor(ds_config.monitor_config.csv_monitor)` object, which will initialize only that monitor backend from the DeepSpeed configuration


The steps to create a custom monitor are as follows:
Expand Down
86 changes: 85 additions & 1 deletion tests/unit/monitor/test_monitor.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,14 +3,18 @@

# DeepSpeed Team

import sys

from deepspeed.monitor.tensorboard import TensorBoardMonitor
from deepspeed.monitor.wandb import WandbMonitor
from deepspeed.monitor.csv_monitor import csvMonitor
from deepspeed.monitor.config import DeepSpeedMonitorConfig
from deepspeed.monitor.comet import CometMonitor
from deepspeed.monitor.trackio import TrackioMonitor
from deepspeed.monitor.monitor import MonitorMaster

from unit.common import DistributedTest
from unittest.mock import Mock, patch
from unittest.mock import Mock, MagicMock, patch
from deepspeed.runtime.config import DeepSpeedConfig

import deepspeed.comm as dist
Expand Down Expand Up @@ -164,3 +168,83 @@ def test_empty_comet(self):
assert comet_monitor.enabled == defaults.enabled
assert comet_monitor.samples_log_interval == defaults.samples_log_interval
mock_start.assert_not_called()


class TestTrackio(DistributedTest):
    """Unit tests for ``TrackioMonitor`` using a stubbed ``trackio`` module."""

    world_size = 2

    def test_trackio(self):
        """An enabled config sets attributes and calls ``trackio.init`` on rank 0 only."""
        # trackio is an optional dependency, so the module is stubbed in
        # sys.modules instead of being required in the CI environment.
        trackio_stub = MagicMock()
        ds_config = DeepSpeedConfig({"train_batch_size": 2, "trackio": {"enabled": True, "project": "my_project"}})

        with patch.dict(sys.modules, {"trackio": trackio_stub}):
            monitor = TrackioMonitor(ds_config.monitor_config.trackio)

        assert monitor.enabled == True
        assert monitor.project == "my_project"

        # Non-zero ranks must never initialize a Trackio run.
        if dist.get_rank() != 0:
            trackio_stub.init.assert_not_called()
        else:
            trackio_stub.init.assert_called_once_with(project="my_project")

    def test_empty_trackio(self):
        """An empty ``trackio`` section falls back to the config defaults."""
        trackio_stub = MagicMock()
        ds_config = DeepSpeedConfig({"train_batch_size": 2, "trackio": {}})

        with patch.dict(sys.modules, {"trackio": trackio_stub}):
            monitor = TrackioMonitor(ds_config.monitor_config.trackio)

        expected = DeepSpeedMonitorConfig().trackio
        assert monitor.enabled == expected.enabled
        assert monitor.project == expected.project

    def test_trackio_write_events(self):
        """``write_events`` turns a 3-tuple into one ``trackio.log`` call with its step."""
        trackio_stub = MagicMock()
        ds_config = DeepSpeedConfig({"train_batch_size": 2, "trackio": {"enabled": True, "project": "my_project"}})

        with patch.dict(sys.modules, {"trackio": trackio_stub}):
            monitor = TrackioMonitor(ds_config.monitor_config.trackio)
            monitor.write_events([("Train/Loss", 0.5, 100)])

        if dist.get_rank() != 0:
            trackio_stub.log.assert_not_called()
        else:
            trackio_stub.log.assert_called_once_with({"Train/Loss": 0.5}, step=100)


class TestMonitorMasterTrackioWiring(DistributedTest):
    """Checks that ``MonitorMaster`` creates a ``TrackioMonitor`` only when enabled."""

    world_size = 2

    def test_trackio_enabled_creates_monitor(self):
        """With trackio enabled, rank 0 holds a ``TrackioMonitor``; other ranks hold ``None``."""
        trackio_stub = MagicMock()
        ds_config = DeepSpeedConfig({"train_batch_size": 2, "trackio": {"enabled": True, "project": "my_project"}})

        with patch.dict(sys.modules, {"trackio": trackio_stub}):
            master = MonitorMaster(ds_config.monitor_config)

        if dist.get_rank() != 0:
            assert master.trackio_monitor is None
        else:
            assert master.trackio_monitor is not None
            assert isinstance(master.trackio_monitor, TrackioMonitor)

    def test_trackio_disabled_skips_monitor(self):
        """With trackio disabled, no Trackio monitor is created on any rank."""
        ds_config = DeepSpeedConfig({"train_batch_size": 2, "trackio": {"enabled": False}})
        master = MonitorMaster(ds_config.monitor_config)
        assert master.trackio_monitor is None
Loading