Skip to content
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,11 @@
import sys

from cuda.pathfinder._dynamic_libs.find_nvidia_dynamic_lib import _FindNvidiaDynamicLib
from cuda.pathfinder._dynamic_libs.load_dl_common import LoadedDL, load_dependencies
from cuda.pathfinder._dynamic_libs.load_dl_common import DynamicLibNotFoundError, LoadedDL, load_dependencies
from cuda.pathfinder._dynamic_libs.supported_nvidia_libs import (
SUPPORTED_LINUX_SONAMES,
SUPPORTED_WINDOWS_DLLS,
)
from cuda.pathfinder._utils.platform_aware import IS_WINDOWS

if IS_WINDOWS:
Expand All @@ -22,8 +26,44 @@
load_with_system_search,
)

# All libnames recognized by load_nvidia_dynamic_lib, across all categories
# (CTK, third-party, driver). Built from the platform-appropriate soname/DLL
# registry so that platform-specific libs (e.g. cufile on Linux) are included
# only where they apply.
# Iterating a dict yields its keys, so freezing the platform registry itself
# gives the set of recognized libnames.
_ALL_SUPPORTED_LIBNAMES: frozenset[str] = frozenset(
    SUPPORTED_LINUX_SONAMES if not IS_WINDOWS else SUPPORTED_WINDOWS_DLLS
)

# Driver libraries: shipped with the NVIDIA display driver, always on the
# system linker path. These skip all CTK search steps (site-packages,
# conda, CUDA_HOME, canary) and go straight to system search.
_DRIVER_ONLY_LIBNAMES: frozenset[str] = frozenset({"cuda", "nvml"})


def _load_driver_lib_no_cache(libname: str) -> LoadedDL:
    """Resolve an NVIDIA driver library using system search only.

    Driver libs (libcuda, libnvidia-ml) ship with the display driver rather
    than the CUDA Toolkit, so the CTK search cascade (site-packages, conda,
    CUDA_HOME, canary) is not consulted here.

    Args:
        libname: Short driver library name (e.g. ``"cuda"`` or ``"nvml"``).

    Returns:
        The ``LoadedDL`` from the already-loaded check if there is one,
        otherwise the ``LoadedDL`` produced by the system search.

    Raises:
        DynamicLibNotFoundError: If neither step yields a library.
    """
    # Step 0: prefer a copy that is already loaded in this process.
    # NOTE(review): the meaning of the second positional argument (False) is
    # not visible here -- confirm against check_if_already_loaded_from_elsewhere.
    preloaded = check_if_already_loaded_from_elsewhere(libname, False)
    if preloaded is not None:
        return preloaded

    # Step 1: fall back to the OS default lookup.
    via_system = load_with_system_search(libname)
    if via_system is None:
        raise DynamicLibNotFoundError(
            f'"{libname}" is an NVIDIA driver library and can only be found via'
            f" system search. Ensure the NVIDIA display driver is installed."
        )
    return via_system


def _load_lib_no_cache(libname: str) -> LoadedDL:
if libname in _DRIVER_ONLY_LIBNAMES:
return _load_driver_lib_no_cache(libname)

finder = _FindNvidiaDynamicLib(libname)
abs_path = finder.try_site_packages()
if abs_path is not None:
Expand Down Expand Up @@ -83,6 +123,7 @@ def load_nvidia_dynamic_lib(libname: str) -> LoadedDL:
https://github.com/NVIDIA/cuda-python/issues/1011

Raises:
ValueError: If ``libname`` is not a recognized library name.
DynamicLibNotFoundError: If the library cannot be found or loaded.
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

CuPy noticed this change in CI (although it is probably just cupy.show_runtime() breaking on Windows, when checking for nccl).

I.e., the CuPy code currently assumes that load_nvidia_dynamic_lib always raises DynamicLibNotFoundError. This could be fixed on either side, but maybe it makes sense to just keep raising DynamicLibNotFoundError (with the new message), so that downstream try/except code is unaffected?

(Although, I am not sure if it may be nice to be able to distinguish the two errors.)

CC @leofang

RuntimeError: If Python is not 64-bit.

Expand Down Expand Up @@ -123,6 +164,18 @@ def load_nvidia_dynamic_lib(libname: str) -> LoadedDL:

- If set, use ``CUDA_HOME`` or ``CUDA_PATH`` (in that order).

**Driver libraries** (``"cuda"``, ``"nvml"``):

These are part of the NVIDIA display driver (not the CUDA Toolkit) and
are always on the system linker path. For these libraries the search
is simplified to:

0. Already loaded in the current process
1. OS default mechanisms (``dlopen`` / ``LoadLibraryW``)

The CTK-specific steps (site-packages, conda, ``CUDA_HOME``, canary
probe) are skipped entirely.

Notes:
The search is performed **per library**. There is currently no mechanism to
guarantee that multiple libraries are all resolved from the same location.
Expand All @@ -135,4 +188,6 @@ def load_nvidia_dynamic_lib(libname: str) -> LoadedDL:
f" Currently running: {pointer_size_bits}-bit Python"
f" {sys.version_info.major}.{sys.version_info.minor}"
)
if libname not in _ALL_SUPPORTED_LIBNAMES:
raise ValueError(f"Unsupported library name: {libname!r}. Supported names: {sorted(_ALL_SUPPORTED_LIBNAMES)}")
return _load_lib_no_cache(libname)
Original file line number Diff line number Diff line change
Expand Up @@ -214,7 +214,15 @@
"nvpl_fftw": ("libnvpl_fftw.so.0",),
"nvshmem_host": ("libnvshmem_host.so.3",),
}
SUPPORTED_LINUX_SONAMES = SUPPORTED_LINUX_SONAMES_CTK | SUPPORTED_LINUX_SONAMES_OTHER
# Driver libraries: shipped with the NVIDIA driver, always on the system
# linker path. Only system search is needed (no site-packages / conda /
# CUDA_HOME). Note the non-standard naming: "cuda" → libcuda.so.1,
# "nvml" → libnvidia-ml.so.1.
Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
# CUDA_HOME). Note the non-standard naming: "cuda" → libcuda.so.1,
# "nvml" → libnvidia-ml.so.1.
# CUDA_HOME).

Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

(seems entirely obvious; avoids duplicating names in comment that may go stale)

SUPPORTED_LINUX_SONAMES_DRIVER = {
"cuda": ("libcuda.so.1",),
"nvml": ("libnvidia-ml.so.1",),
}
SUPPORTED_LINUX_SONAMES = SUPPORTED_LINUX_SONAMES_CTK | SUPPORTED_LINUX_SONAMES_OTHER | SUPPORTED_LINUX_SONAMES_DRIVER

# Based on these files:
# cuda_12.0.1_528.33_windows.exe
Expand Down Expand Up @@ -338,7 +346,11 @@
"cutensor": ("cutensor.dll",),
"cutensorMg": ("cutensorMg.dll",),
}
SUPPORTED_WINDOWS_DLLS = SUPPORTED_WINDOWS_DLLS_CTK | SUPPORTED_WINDOWS_DLLS_OTHER
# Driver libraries: shipped with the NVIDIA driver rather than the CUDA
# Toolkit. Maps each driver libname to its Windows DLL file name(s).
SUPPORTED_WINDOWS_DLLS_DRIVER = {
"cuda": ("nvcuda.dll",),
"nvml": ("nvml.dll",),
}
# Full Windows registry: CTK, other (third-party), and driver entries merged.
SUPPORTED_WINDOWS_DLLS = SUPPORTED_WINDOWS_DLLS_CTK | SUPPORTED_WINDOWS_DLLS_OTHER | SUPPORTED_WINDOWS_DLLS_DRIVER

LIBNAMES_REQUIRING_OS_ADD_DLL_DIRECTORY = (
"cufft",
Expand Down
127 changes: 127 additions & 0 deletions cuda_pathfinder/tests/test_driver_lib_loading.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,127 @@
# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0

"""Tests for NVIDIA driver library loading ("cuda", "nvml").

These libraries are part of the display driver, not the CUDA Toolkit.
They use a simplified system-search-only path, skipping site-packages,
conda, CUDA_HOME, and the canary probe.
"""

import pytest

from cuda.pathfinder._dynamic_libs.load_dl_common import DynamicLibNotFoundError, LoadedDL
from cuda.pathfinder._dynamic_libs.load_nvidia_dynamic_lib import (
_DRIVER_ONLY_LIBNAMES,
_load_driver_lib_no_cache,
_load_lib_no_cache,
)

_MODULE = "cuda.pathfinder._dynamic_libs.load_nvidia_dynamic_lib"


def _make_loaded_dl(path, found_via):
    """Build a ``LoadedDL`` for tests from a path and a ``found_via`` tag.

    The two middle positional fields are fixed placeholders (``False`` and
    ``0xDEAD``).
    """
    positional_fields = (path, False, 0xDEAD, found_via)
    return LoadedDL(*positional_fields)


# ---------------------------------------------------------------------------
# _DRIVER_ONLY_LIBNAMES registry
# ---------------------------------------------------------------------------


@pytest.mark.parametrize("libname", ["cuda", "nvml"])
def test_driver_only_libnames_contains(libname):
    """Each listed driver libname is registered in ``_DRIVER_ONLY_LIBNAMES``."""
    is_registered = libname in _DRIVER_ONLY_LIBNAMES
    assert is_registered


@pytest.mark.parametrize("libname", ["cudart", "nvrtc", "cublas", "nvvm"])
def test_driver_only_libnames_excludes_ctk_libs(libname):
    """None of the listed CTK libnames is registered as a driver-only lib."""
    is_registered = libname in _DRIVER_ONLY_LIBNAMES
    assert not is_registered
Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Best removed: these just seem to exercise in and not in based on made-up lists of specific libnames.

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yep, will do.



# ---------------------------------------------------------------------------
Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We need some real tests; mocks are great for hard-to-reach-branch coverage, but other than that, they give a false sense of "oh this works".

We should have tests for real loading, and track them with INFOs, so that we can validate that it actually works by inspecting the test logs (CI, QA).

We should have at least some all_must_work on platforms where we're sure it must work. Without, we'll be blind to regressions.

I realize this is unusual, but that's the nature of a piece of software with the sole purpose of adapting to highly diverse environments that we don't control.

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can we avoid the info logging stuff until we merge #1593? I'd like to get that in so I don't have to redo related changes from that PR.

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

OK, never mind about the INFO logging; I'll just do it here. It's more important to get this feature in.

# _load_driver_lib_no_cache
# ---------------------------------------------------------------------------


def test_driver_lib_returns_already_loaded(mocker):
    """An already-loaded driver lib is returned as-is, without a system search."""
    already = LoadedDL("/usr/lib/libcuda.so.1", True, 0xBEEF, "was-already-loaded-from-elsewhere")
    mocker.patch(f"{_MODULE}.check_if_already_loaded_from_elsewhere", return_value=already)
    # Keep the mock that mocker.patch returns, so the assertion below does not
    # have to re-import the patched module to reach it.
    mock_system_search = mocker.patch(f"{_MODULE}.load_with_system_search")

    result = _load_driver_lib_no_cache("cuda")

    assert result is already
    # The already-loaded hit must short-circuit before the system search.
    mock_system_search.assert_not_called()


def test_driver_lib_falls_through_to_system_search(mocker):
    """With nothing pre-loaded, the result is whatever the system search returns."""
    mocker.patch(f"{_MODULE}.check_if_already_loaded_from_elsewhere", return_value=None)
    from_system = _make_loaded_dl("/usr/lib/libcuda.so.1", "system-search")
    mocker.patch(f"{_MODULE}.load_with_system_search", return_value=from_system)

    outcome = _load_driver_lib_no_cache("cuda")

    assert outcome is from_system
    assert outcome.found_via == "system-search"


def test_driver_lib_raises_when_not_found(mocker):
    """Both lookup steps returning ``None`` surfaces as ``DynamicLibNotFoundError``."""
    for lookup_step in ("check_if_already_loaded_from_elsewhere", "load_with_system_search"):
        mocker.patch(f"{_MODULE}.{lookup_step}", return_value=None)

    with pytest.raises(DynamicLibNotFoundError, match="NVIDIA driver library"):
        _load_driver_lib_no_cache("nvml")


def test_driver_lib_does_not_search_site_packages(mocker):
    """Driver libs must not go through the CTK search cascade."""
    from cuda.pathfinder._dynamic_libs.find_nvidia_dynamic_lib import _FindNvidiaDynamicLib

    mocker.patch(f"{_MODULE}.check_if_already_loaded_from_elsewhere", return_value=None)
    mocker.patch(
        f"{_MODULE}.load_with_system_search",
        return_value=_make_loaded_dl("/usr/lib/libcuda.so.1", "system-search"),
    )
    # Spy on the first CTK search step; the driver path must never reach it.
    site_packages_spy = mocker.spy(_FindNvidiaDynamicLib, "try_site_packages")

    _load_driver_lib_no_cache("cuda")

    site_packages_spy.assert_not_called()


# ---------------------------------------------------------------------------
# _load_lib_no_cache dispatches driver libs correctly
# ---------------------------------------------------------------------------


@pytest.mark.parametrize("libname", sorted(_DRIVER_ONLY_LIBNAMES))
def test_load_lib_no_cache_dispatches_to_driver_path(libname, mocker):
    """Every driver libname is handed to ``_load_driver_lib_no_cache`` exactly once."""
    sentinel_dl = _make_loaded_dl(f"/usr/lib/fake_{libname}.so", "system-search")
    driver_loader = mocker.patch(f"{_MODULE}._load_driver_lib_no_cache", return_value=sentinel_dl)

    outcome = _load_lib_no_cache(libname)

    assert outcome is sentinel_dl
    driver_loader.assert_called_once_with(libname)


def test_load_lib_no_cache_does_not_dispatch_ctk_lib_to_driver_path(mocker):
    """Ensure regular CTK libs don't take the driver shortcut."""
    from cuda.pathfinder._dynamic_libs.find_nvidia_dynamic_lib import _FindNvidiaDynamicLib

    driver_loader = mocker.patch(f"{_MODULE}._load_driver_lib_no_cache")

    # Stub the steps of the regular path so it can run without a real
    # environment: the finder steps and the already-loaded check report
    # nothing, dependency loading is a no-op mock, and the mocked system
    # search supplies a result.
    for finder_step in ("try_site_packages", "try_with_conda_prefix"):
        mocker.patch.object(_FindNvidiaDynamicLib, finder_step, return_value=None)
    mocker.patch(f"{_MODULE}.check_if_already_loaded_from_elsewhere", return_value=None)
    mocker.patch(f"{_MODULE}.load_dependencies")
    mocker.patch(
        f"{_MODULE}.load_with_system_search",
        return_value=_make_loaded_dl("/usr/lib/libcudart.so.13", "system-search"),
    )

    _load_lib_no_cache("cudart")

    driver_loader.assert_not_called()
17 changes: 10 additions & 7 deletions cuda_pathfinder/tests/test_load_nvidia_dynamic_lib.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,6 @@
import json
import os
import platform
from unittest.mock import patch

import pytest
import spawned_process_runner
Expand Down Expand Up @@ -62,12 +61,16 @@ def test_supported_libnames_windows_libnames_requiring_os_add_dll_directory_cons
)


def test_runtime_error_on_non_64bit_python():
with (
patch("struct.calcsize", return_value=3), # fake 24-bit pointer
pytest.raises(RuntimeError, match=r"requires 64-bit Python\. Currently running: 24-bit Python"),
):
load_nvidia_dynamic_lib("not_used")
def test_runtime_error_on_non_64bit_python(mocker):
    """A non-64-bit pointer size is rejected with ``RuntimeError``."""
    # A 3-byte pointer size is reported as a 24-bit Python.
    mocker.patch("struct.calcsize", return_value=3)
    expected_message = r"requires 64-bit Python\. Currently running: 24-bit Python"
    with pytest.raises(RuntimeError, match=expected_message):
        load_nvidia_dynamic_lib("cudart")


@pytest.mark.parametrize("libname", ["bogus", "not_a_real_lib", "cupti"])
Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Do we need to include "cupti" here? I'd rather stay clear of mentioning any real names here, if for nothing else but avoiding head scratching when searching the codebase for cupti.

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Not really, fixes incoming.

def test_unsupported_libname_raises_value_error(libname):
    """An unrecognized libname raises ``ValueError`` whose message lists supported names."""
    expected_message = rf"Unsupported library name: '{libname}'.*cudart"
    with pytest.raises(ValueError, match=expected_message):
        load_nvidia_dynamic_lib(libname)


IMPORTLIB_METADATA_DISTRIBUTIONS_NAMES = {
Expand Down
Loading