Skip to content
Closed
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .github/scripts/fbgemm_gpu_test.bash
Original file line number Diff line number Diff line change
Expand Up @@ -152,7 +152,7 @@ __configure_fbgemm_gpu_test_rocm () {

# AMD GPUs need to be explicitly made visible to PyTorch for use
# shellcheck disable=SC2155,SC2126
local num_gpus=$(rocm-smi --showproductname | grep GUID | wc -l)
local num_gpus=$(amd-smi list | grep -c "^GPU")
# shellcheck disable=SC2155
local gpu_indices=$(seq 0 $((num_gpus - 1)) | paste -sd, -)
# shellcheck disable=SC2086
Expand Down
7 changes: 3 additions & 4 deletions .github/scripts/utils_rocm.bash
Original file line number Diff line number Diff line change
Expand Up @@ -100,10 +100,9 @@ install_rocm_ubuntu () {
echo "[INSTALL] Cleaning up ..."
print_exec rm -f "${package_name}"

echo "[INFO] Printing ROCM utilities info ..."
# If rocm-smi is installed on a machine without GPUs, this will return error
(print_exec rocminfo) || true
(print_exec rocm-smi) || true
echo "[INFO] Printing AMD-SMI utilities info ..."
# If amd-smi is installed on a machine without GPUs, this will return an error
(print_exec amd-smi) || true
(print_exec hipcc -v) || true

echo "[INSTALL] Successfully installed ROCm ${rocm_version}"
Expand Down
19 changes: 6 additions & 13 deletions .github/scripts/utils_system.bash
Original file line number Diff line number Diff line change
Expand Up @@ -173,26 +173,19 @@ print_gpu_info () {
(lspci -v | grep -e 'Display controller: Advanced') || true

if [[ "${ENFORCE_ROCM_DEVICE}" ]]; then
# Ensure that rocm-smi is available and returns GPU entries
if ! rocm-smi; then
# Ensure that amd-smi is available and returns GPU entries
if ! amd-smi; then
echo "[CHECK] ROCm drivers and ROCm device(s) are required for this workflow, but does not appear to be installed or available!"
return 1
fi

else
if which rocm-smi; then
echo "[CHECK] rocm-smi found; printing info ..."
if which amd-smi; then
echo "[CHECK] amd-smi found; printing info ..."
# If the program is installed on a machine without GPUs, invoking it will return an error
(print_exec rocm-smi --showproductname) || true
(print_exec amd-smi --showproductname) || true
else
echo "[CHECK] rocm-smi not found"
fi

if which rocminfo; then
echo "[CHECK] rocminfo found; printing info ..."
(print_exec rocminfo) || true
else
echo "[CHECK] rocminfo not found"
echo "[CHECK] amd-smi not found"
fi
fi
}
Expand Down
30 changes: 16 additions & 14 deletions ci/utils/gpu_detect.bash
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,7 @@
detect_gpu_vendor() {
if command -v nvidia-smi &> /dev/null; then
echo "nvidia"
elif command -v rocm-smi &> /dev/null; then
elif command -v amd-smi &> /dev/null; then
echo "amd"
else
echo ""
Expand Down Expand Up @@ -111,28 +111,28 @@ detect_nvidia_gpu_model() {

# Detect the GPU model of the first AMD GPU.
#
# This function queries rocm-smi for the GFX Version and maps it to a known
# This function queries amd-smi for the GFX Version and maps it to a known
# GPU model name using the AMD_GFX_MODEL_MAP associative array.
# The returned model name is always lowercased.
#
# Returns:
# Lowercased GPU model (e.g., "mi300", "mi350", "mi250")
# "" - if rocm-smi is not available or no GPU is detected
# "" - if amd-smi is not available or no GPU is detected
#
# Usage:
# source gpu.bash
# model=$(detect_amd_gpu_model)
# echo "GPU model: $model" # e.g., "mi350"
#
detect_amd_gpu_model() {
# Check if rocm-smi is available
if ! command -v rocm-smi &> /dev/null; then
echo "rocm-smi not found; cannot detect AMD GPU model" >&2
# Check if amd-smi is available
if ! command -v amd-smi &> /dev/null; then
echo "amd-smi not found; cannot detect AMD GPU model" >&2
return 1
fi

# Associative array mapping GFX versions to GPU model names.
# Keys are the GFX versions (from "GFX Version" field in rocm-smi --showproductname).
# Keys are the GFX versions (from the "TARGET_GRAPHICS_VERSION" field of `amd-smi static --asic`).
# Values are the desired lowercased model names.
#
# Target architecture, card model, and ROCm compatibility tables can be found
Expand All @@ -142,7 +142,7 @@ detect_amd_gpu_model() {
# https://www.coelacanth-dream.com/posts/2019/12/30/did-rid-product-matome-p2/
#
# To find the GFX version for a new GPU, run:
# rocm-smi --showproductname | grep "GFX Version"
# amd-smi static --asic | grep "TARGET_GRAPHICS_VERSION"
#
declare -A AMD_GFX_MODEL_MAP=(
# MI350 series (CDNA 4)
Expand All @@ -157,11 +157,13 @@ detect_amd_gpu_model() {
["gfx906"]="mi50"
)

# Get the GFX Version from rocm-smi (first GPU only)
# rocm-smi --showproductname outputs something like:
# GPU[0] : GFX Version: gfx950
# Get the GFX Version from amd-smi (first GPU only)
# amd-smi static --asic outputs something like:
# GPU: 0
# ASIC:
# TARGET_GRAPHICS_VERSION: gfx950
local gfx_version
gfx_version=$(rocm-smi --showproductname 2>/dev/null | grep -m1 "GFX Version:" | sed 's/.*GFX Version:[[:space:]]*//' | xargs)
gfx_version=$(amd-smi static --asic 2>/dev/null | grep -m1 "TARGET_GRAPHICS_VERSION:" | sed 's/.*TARGET_GRAPHICS_VERSION:[[:space:]]*//' | xargs)

if [[ -z "$gfx_version" ]]; then
echo "Could not detect AMD GPU GFX version" >&2
Expand Down Expand Up @@ -269,7 +271,7 @@ detect_gpu_count() {
if [[ "${vendor}" == "nvidia" ]]; then
nvidia-smi --query-gpu=index --format=csv,noheader 2>/dev/null | wc -l
elif [[ "${vendor}" == "amd" ]]; then
rocm-smi --showid 2>/dev/null | grep -oP "GPU\[\K[0-9]+" | sort -u | wc -l
amd-smi list 2>/dev/null | grep -c "^GPU"
else
echo 1
fi
Expand Down Expand Up @@ -311,7 +313,7 @@ gpu_is_busy() {
fi
elif [[ "${vendor}" == "amd" ]]; then
local util
util=$(rocm-smi -d "${gpu_id}" --showuse 2>/dev/null | grep "GPU use" | awk '{print $NF}' | tr -d '%' || echo "0")
util=$(amd-smi metric -g "${gpu_id}" --usage 2>/dev/null | grep "GFX_ACTIVITY:" | awk '{print $(NF-1)}' || echo "0")
if [[ "${util}" -gt "${util_threshold}" ]]; then
return 0
fi
Expand Down
2 changes: 0 additions & 2 deletions cmake/modules/GpuCppLibrary.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -257,8 +257,6 @@ function(gpu_cpp_library)
# Append ROCM includes
target_include_directories(${lib_name} PUBLIC
${FBGEMM_HIP_INCLUDE}
${ROCRAND_INCLUDE}
${ROCM_SMI_INCLUDE}
${args_INCLUDE_DIRS})

else()
Expand Down
2 changes: 1 addition & 1 deletion fbgemm_gpu/cmake/Hip.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -100,7 +100,7 @@ if(HIP_FOUND)
set(FBGEMM_HIP_INCLUDE ${ROCM_PATH}/include ${FBGEMM_HIP_INCLUDE})
set(FBGEMM_HIP_INCLUDE ${hip_INCLUDE_DIRS} $<BUILD_INTERFACE:${PROJECT_SOURCE_DIR}> $<INSTALL_INTERFACE:include> ${FBGEMM_HIP_INCLUDE})

hip_include_directories(${FBGEMM_HIP_INCLUDE} ${ROCRAND_INCLUDE} ${ROCM_SMI_INCLUDE})
hip_include_directories(${FBGEMM_HIP_INCLUDE})

list (APPEND CMAKE_PREFIX_PATH ${HIP_PATH} ${ROCM_PATH})
endif()
Original file line number Diff line number Diff line change
Expand Up @@ -121,20 +121,28 @@ The AMDGPU display drivers must be installed on the system prior to all other
environment setup. The steps provided by
`AMD <https://docs.amd.com/bundle/ROCm-Installation-Guide-v5.5/page/How_to_Install_ROCm.html>`__
are the most authoritative instructions for doing this. Driver setup may be
verified with the ``rocm-smi`` command:
verified with the ``amd-smi`` command:

.. code:: sh

rocm-smi

======================= ROCm System Management Interface =======================
================================= Concise Info =================================
GPU Temp (DieEdge) AvgPwr SCLK MCLK Fan Perf PwrCap VRAM% GPU%
0 33.0c 37.0W 300Mhz 1200Mhz 0% auto 290.0W 0% 0%
1 32.0c 39.0W 300Mhz 1200Mhz 0% auto 290.0W 0% 0%
2 33.0c 37.0W 300Mhz 1200Mhz 0% auto 290.0W 0% 0%
================================================================================
============================= End of ROCm SMI Log ==============================
amd-smi

+------------------------------------------------------------------------------+
| AMD-SMI 26.3.0+615aab95ed |
| amdgpu Version: 6.14.19 |
| ROCm Version: 7.3.0 |
| VBIOS Version: 020.001.000.060.000000 |
| Platform: Linux Baremetal |
|-------------------------------------+----------------------------------------|
| BDF GPU-Name | Mem-Uti Temp UEC Power-Usage |
| GPU HIP-ID OAM-ID Partition-Mode | GFX-Uti Fan Mem-Usage |
|=====================================+========================================|
| 0000:43:00.0 AMD Radeon RX 6800 XT | 0 % 32 °C 0 12/272 W |
| 0 1 N/A N/A | 0 % 0.0 % 16/16368 MB |
|-------------------------------------+----------------------------------------|
| 0000:63:00.0 Radeon RX 7900 XT | 0 % 41 °C 0 50/257 W |
| 1 0 N/A N/A | 4 % 0.0 % 26/20464 MB |
+-------------------------------------+----------------------------------------+

Set Up the ROCm Docker Container and Conda Environment
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -78,7 +78,7 @@ For ROCm machines, testing against a ROCm GPU needs to be enabled with
# Specify the specific HIP devices to run the tests on
#
# NOTE: This is necessary if PyTorch is unable to see the devices that
# `rocm-smi --showproductname` can see
# `amd-smi static --asic` can see
export HIP_VISIBLE_DEVICES=0,1,2,3

# Enable for debugging kernel executions
Expand Down
63 changes: 39 additions & 24 deletions fbgemm_gpu/src/topology_utils.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -15,40 +15,55 @@

#ifdef USE_ROCM
#include <inttypes.h>
#include "amd_smi/amdsmi.h"
#include "hip/hip_runtime.h"
#include "rocm_smi/rocm_smi.h"

#define RSMI_CHECK(fn) \
do { \
rsmi_status_t ret = (fn); \
TORCH_CHECK_EQ((ret), RSMI_STATUS_SUCCESS); \
#define AMDSMI_CHECK(fn) \
do { \
amdsmi_status_t ret = (fn); \
TORCH_CHECK_EQ((ret), AMDSMI_STATUS_SUCCESS); \
} while (0)

#define RSMI_DEVICE_PCI_BUS_ID_BUFFER_SIZE 16
#define AMDSMI_DEVICE_PCI_BUS_ID_BUFFER_SIZE 16

namespace fbgemm_gpu {
AdjacencyMatrix<Links> get_nvlink_matrix() {
auto world_size = at::cuda::getNumGPUs();
RSMI_CHECK(rsmi_init(0));
AMDSMI_CHECK(amdsmi_init(AMDSMI_INIT_AMD_GPUS));

// Note that ROCm_SMI uses a different numbering method to ROCm runtime,
// Note that AMD SMI uses a different numbering method to ROCm runtime,
// so we need to learn the mapping by using the bus ID.
uint32_t device_count;
RSMI_CHECK(rsmi_num_monitor_devices(&device_count));

std::unordered_map<Node, uint32_t> rocm_device_to_rsmi_device;
// Get all sockets, then collect all GPU processor handles across sockets.
uint32_t socket_count = 0;
AMDSMI_CHECK(amdsmi_get_socket_handles(&socket_count, nullptr));
std::vector<amdsmi_socket_handle> sockets(socket_count);
AMDSMI_CHECK(amdsmi_get_socket_handles(&socket_count, sockets.data()));

std::vector<amdsmi_processor_handle> processor_handles;
for (uint32_t s = 0; s < socket_count; s++) {
uint32_t device_count = 0;
AMDSMI_CHECK(amdsmi_get_processor_handles(sockets[s], &device_count, nullptr));
std::vector<amdsmi_processor_handle> socket_handles(device_count);
AMDSMI_CHECK(amdsmi_get_processor_handles(
sockets[s], &device_count, socket_handles.data()));
processor_handles.insert(
processor_handles.end(), socket_handles.begin(), socket_handles.end());
}

for (const auto i : c10::irange(device_count)) {
std::unordered_map<Node, amdsmi_processor_handle> hip_device_to_handle;

for (const auto& handle : processor_handles) {
uint64_t pci_info;
RSMI_CHECK(rsmi_dev_pci_id_get(i, &pci_info));
AMDSMI_CHECK(amdsmi_get_gpu_bdf_id(handle, &pci_info));
uint64_t domain, bus, device, function;
domain = (pci_info >> 32) & 0xffffffff;
bus = (pci_info >> 8) & 0xff;
device = (pci_info >> 3) & 0x1f;
function = pci_info & 0x7;
// Different from CUDA, we do not get the PCI BUS ID as a char* and we need
// to reconstruct it.
char pci_bus_id_str[RSMI_DEVICE_PCI_BUS_ID_BUFFER_SIZE];
char pci_bus_id_str[AMDSMI_DEVICE_PCI_BUS_ID_BUFFER_SIZE];
sprintf(
pci_bus_id_str,
"%04" PRIu64 ":%02" PRIu64 ":%02" PRIu64 ".%0" PRIu64,
Expand All @@ -57,15 +72,15 @@ AdjacencyMatrix<Links> get_nvlink_matrix() {
device,
function);

std::array<char, RSMI_DEVICE_PCI_BUS_ID_BUFFER_SIZE> pci_bus_id;
std::array<char, AMDSMI_DEVICE_PCI_BUS_ID_BUFFER_SIZE> pci_bus_id;
std::copy(
&pci_bus_id_str[0],
&pci_bus_id_str[RSMI_DEVICE_PCI_BUS_ID_BUFFER_SIZE],
&pci_bus_id_str[AMDSMI_DEVICE_PCI_BUS_ID_BUFFER_SIZE],
pci_bus_id.data());
int32_t node = 0;
auto err = hipDeviceGetByPCIBusId(&node, pci_bus_id.data());
if (err == hipSuccess) {
rocm_device_to_rsmi_device.insert({node, i});
hip_device_to_handle.insert({node, handle});
} else {
// flush the last error - this can occur when e.g. we set
// HIP_VISIBLE_DEVICES to a subset of the available GPUs in the system.
Expand All @@ -75,22 +90,22 @@ AdjacencyMatrix<Links> get_nvlink_matrix() {

std::vector<Links> links(world_size * world_size);
for (const auto i : c10::irange(world_size)) {
auto src_rsmi_device = rocm_device_to_rsmi_device.find(i);
if (src_rsmi_device != rocm_device_to_rsmi_device.end()) {
auto src = hip_device_to_handle.find(i);
if (src != hip_device_to_handle.end()) {
for (const auto j : c10::irange(world_size)) {
auto dst_rsmi_device = rocm_device_to_rsmi_device.find(j);
if (dst_rsmi_device != rocm_device_to_rsmi_device.end()) {
auto dst = hip_device_to_handle.find(j);
if (dst != hip_device_to_handle.end()) {
bool is_active;
RSMI_CHECK(rsmi_is_P2P_accessible(
src_rsmi_device->second, dst_rsmi_device->second, &is_active));
AMDSMI_CHECK(
amdsmi_is_P2P_accessible(src->second, dst->second, &is_active));
if (is_active) {
links[i * world_size + j] += 1;
}
}
}
}
}
RSMI_CHECK(rsmi_shut_down());
AMDSMI_CHECK(amdsmi_shut_down());
return [=](Node i, Node j) {
TORCH_CHECK_LT(i, world_size);
TORCH_CHECK_LT(j, world_size);
Expand Down
Loading