diff --git a/libvmaf/src/cuda/common.h b/libvmaf/src/cuda/common.h index 5a0690ddc..e8f479c45 100644 --- a/libvmaf/src/cuda/common.h +++ b/libvmaf/src/cuda/common.h @@ -20,7 +20,6 @@ #ifndef __VMAF_SRC_CUDA_COMMON_H__ #define __VMAF_SRC_CUDA_COMMON_H__ -#include #include #include "config.h" diff --git a/libvmaf/src/cuda/cuda_helper.cuh b/libvmaf/src/cuda/cuda_helper.cuh index 82b0b9d91..803eaa4b4 100644 --- a/libvmaf/src/cuda/cuda_helper.cuh +++ b/libvmaf/src/cuda/cuda_helper.cuh @@ -29,7 +29,11 @@ #include "assert.h" #include "stdio.h" +#ifdef DEVICE_CODE +#include +#else #include +#endif #define DIV_ROUND_UP(x, y) (((x) + (y)-1) / (y)) #define MAX(x, y) (((x) > (y)) ? (x) : (y)) diff --git a/libvmaf/src/feature/cuda/integer_adm/adm_cm.cu b/libvmaf/src/feature/cuda/integer_adm/adm_cm.cu index ea3d88dec..9ab0af8fe 100644 --- a/libvmaf/src/feature/cuda/integer_adm/adm_cm.cu +++ b/libvmaf/src/feature/cuda/integer_adm/adm_cm.cu @@ -16,7 +16,9 @@ * limitations under the License. * */ +#ifndef DEVICE_CODE #include "feature_collector.h" +#endif #include "cuda/integer_adm_cuda.h" #include "common.h" #include "cuda_helper.cuh" diff --git a/libvmaf/src/feature/cuda/integer_adm/adm_csf.cu b/libvmaf/src/feature/cuda/integer_adm/adm_csf.cu index a103727da..621868d05 100644 --- a/libvmaf/src/feature/cuda/integer_adm/adm_csf.cu +++ b/libvmaf/src/feature/cuda/integer_adm/adm_csf.cu @@ -17,7 +17,9 @@ * */ +#ifndef DEVICE_CODE #include "feature_collector.h" +#endif #include "cuda/integer_adm_cuda.h" #include "common.h" diff --git a/libvmaf/src/feature/cuda/integer_adm/adm_csf_den.cu b/libvmaf/src/feature/cuda/integer_adm/adm_csf_den.cu index e766010d2..c638ab203 100644 --- a/libvmaf/src/feature/cuda/integer_adm/adm_csf_den.cu +++ b/libvmaf/src/feature/cuda/integer_adm/adm_csf_den.cu @@ -17,7 +17,9 @@ * */ +#ifndef DEVICE_CODE #include "feature_collector.h" +#endif #include "cuda/integer_adm_cuda.h" #include "common.h" diff --git 
a/libvmaf/src/feature/cuda/integer_adm/adm_decouple.cu b/libvmaf/src/feature/cuda/integer_adm/adm_decouple.cu index d8e1ce374..c5e00c9f8 100644 --- a/libvmaf/src/feature/cuda/integer_adm/adm_decouple.cu +++ b/libvmaf/src/feature/cuda/integer_adm/adm_decouple.cu @@ -17,7 +17,9 @@ * */ +#ifndef DEVICE_CODE #include "feature_collector.h" +#endif #include "cuda/integer_adm_cuda.h" #include "common.h" diff --git a/libvmaf/src/feature/cuda/integer_adm/adm_dwt2.cu b/libvmaf/src/feature/cuda/integer_adm/adm_dwt2.cu index 149704175..3ecf18f83 100644 --- a/libvmaf/src/feature/cuda/integer_adm/adm_dwt2.cu +++ b/libvmaf/src/feature/cuda/integer_adm/adm_dwt2.cu @@ -17,7 +17,9 @@ * */ +#ifndef DEVICE_CODE #include "feature_collector.h" +#endif #include "cuda/integer_adm_cuda.h" #include "common.h" diff --git a/libvmaf/src/feature/integer_adm.h b/libvmaf/src/feature/integer_adm.h index bf3836460..91519f18c 100644 --- a/libvmaf/src/feature/integer_adm.h +++ b/libvmaf/src/feature/integer_adm.h @@ -132,10 +132,12 @@ struct dwt_model_params { }; // 0 -> Y, 1 -> Cb, 2 -> Cr +#ifndef __CUDACC__ static const struct dwt_model_params dwt_7_9_YCbCr_threshold[3] = { {.a = 0.495, .k = 0.466, .f0 = 0.401, .g = {1.501, 1.0, 0.534, 1.0}}, {.a = 1.633, .k = 0.353, .f0 = 0.209, .g = {1.520, 1.0, 0.502, 1.0}}, {.a = 0.944, .k = 0.521, .f0 = 0.404, .g = {1.868, 1.0, 0.516, 1.0}}}; +#endif /* * The following dwt basis function amplitudes, A(lambda,theta), are taken from diff --git a/libvmaf/src/meson.build b/libvmaf/src/meson.build index 80a34bc4f..5e7cfa902 100644 --- a/libvmaf/src/meson.build +++ b/libvmaf/src/meson.build @@ -34,8 +34,12 @@ is_asm_enabled = get_option('enable_asm') == true is_cuda_enabled = get_option('enable_cuda') == true is_avx512_enabled = get_option('enable_avx512') == true is_nvtx_enabled = get_option('enable_nvtx') == true +if is_nvtx_enabled and host_machine.system() == 'windows' + warning('NVTX is not supported on Windows, disabling.') + is_nvtx_enabled = false +endif -if 
is_nvtx_enabled +if is_nvtx_enabled cdata.set10('HAVE_NVTX', is_nvtx_enabled) endif if is_cuda_enabled @@ -310,23 +314,80 @@ if is_cuda_enabled ] gencode = [] if get_option('enable_nvcc') - cuda_lang = add_languages('cuda', required : true) - cuda_compiler = meson.get_compiler('cuda') nvcc_exe = find_program('nvcc') + # On Windows, nvcc requires MSVC's cl.exe as host compiler for preprocessing. + # We avoid adding cl.exe to PATH as that causes meson to pick MSVC as the + # default C compiler. Use vswhere + powershell to find cl.exe automatically. + if host_machine.system() == 'windows' + cl_find_result = run_command('powershell', '-NoProfile', '-Command', + '(Get-ChildItem -Path (& "${env:ProgramFiles(x86)}/Microsoft Visual Studio/Installer/vswhere.exe" -latest -products * -property installationPath) -Recurse -Filter cl.exe -ErrorAction SilentlyContinue | Where-Object { $_.DirectoryName -like "*HostX64*x64*" } | Select-Object -First 1).FullName', + check: false) + if cl_find_result.returncode() == 0 and cl_find_result.stdout().strip() != '' + cl_path = cl_find_result.stdout().strip() + message('Found MSVC cl.exe at: ' + cl_path) + nvcc_ccbin_flags = ['--allow-unsupported-compiler', '-ccbin', cl_path] + else + cl_exe = find_program('cl', required: false) + if not cl_exe.found() + error('MSVC cl.exe not found. nvcc on Windows requires Visual Studio Build Tools.') + endif + cl_path = cl_exe.full_path() + nvcc_ccbin_flags = ['--allow-unsupported-compiler', '-ccbin', cl_path] + endif + + # Find MSVC and Windows SDK include dirs so cl.exe can find system + # headers when invoked by nvcc outside a vcvars environment. + # Pass them as -I flags directly to nvcc. + # Derive MSVC include path from cl_path (set by either lookup above) to ensure version match. 
+ msvc_include_result = run_command('powershell', '-NoProfile', '-Command', + '$clPath = "' + cl_path + '"; ' + + '$msvcRoot = ($clPath -replace "\\\\bin\\\\.*$", ""); ' + + 'Write-Output $msvcRoot', + check: false) + winsdk_include_result = run_command('powershell', '-NoProfile', '-Command', + '$sdkInc = Get-ChildItem "${env:ProgramFiles(x86)}/Windows Kits/10/Include" -Directory | Sort-Object Name | Select-Object -Last 1; ' + + 'Write-Output $sdkInc.FullName', + check: false) + nvcc_host_includes = [] + msvc_root_dir = msvc_include_result.stdout().strip() + winsdk_inc_dir = winsdk_include_result.stdout().strip() + if msvc_include_result.returncode() == 0 and winsdk_include_result.returncode() == 0 and msvc_root_dir != '' and winsdk_inc_dir != '' + nvcc_host_includes = [ + '-I', msvc_root_dir / 'include', + '-I', winsdk_inc_dir / 'ucrt', + '-I', winsdk_inc_dir / 'shared', + '-I', winsdk_inc_dir / 'um', + ] + message('MSVC include: ' + msvc_root_dir / 'include') + message('UCRT include: ' + winsdk_inc_dir / 'ucrt') + else + warning('Could not find MSVC/Windows SDK include paths. nvcc compilation may fail.') + endif + else + nvcc_ccbin_flags = [] + nvcc_host_includes = [] + endif + + # Detect CUDA version from nvcc directly instead of add_languages('cuda'), + # which requires MSVC on Windows and fails with MinGW. + # nvcc --version outputs: "...Cuda compilation tools, release 12.6, V12.6.85..." 
+ nvcc_ver_out = run_command(nvcc_exe, '--version', check: true) + cuda_version = nvcc_ver_out.stdout().strip().split('release ')[1].split(',')[0] + message('Found CUDA version = @0@'.format(cuda_version)) + gencode = [ '--fatbin', '-gencode=arch=compute_75,code=sm_75', '-gencode=arch=compute_80,code=sm_80', ] - message('Found CUDA version = @0@'.format(cuda_compiler.version())) - if cuda_compiler.version().version_compare('<13') + if cuda_version.version_compare('<13') gencode += '-gencode=arch=compute_50,code=compute_50' endif # always compile device code to enable quick startup on newer GPUs, for the last supported GPU also generate PTX for future compatibility - if cuda_compiler.version().version_compare('>11.8') + if cuda_version.version_compare('>11.8') gencode += '-gencode=arch=compute_90,code=sm_90' - if cuda_compiler.version().version_compare('>12.8') + if cuda_version.version_compare('>12.8') gencode += [ '-gencode=arch=compute_100,code=sm_100', '-gencode=arch=compute_120,code=sm_120', @@ -353,7 +414,7 @@ if is_cuda_enabled message('CUDA gencode = @0@'.format(gencode)) - cuda_flags = [] + cuda_flags = ['-D_USE_MATH_DEFINES'] if get_option('buildtype') == 'debug' cuda_flags += ['-DCUDA_DEBUG', '-G'] else @@ -378,7 +439,7 @@ if is_cuda_enabled '-I', '../src/feature', '-I', '../src/' + cuda_dir, '-DDEVICE_CODE', - ] + cuda_flags + ] + cuda_flags + nvcc_ccbin_flags + nvcc_host_includes ) ptx_files += {name : [t]} endforeach @@ -414,7 +475,6 @@ if is_cuda_enabled cuda_inc, ], c_args : vmaf_cflags_common, - cuda_args: cuda_flags # + ['-gencode', 'arch=compute_86,code=sm_86' ] #, '--use_fast_math'] ) common_cuda_objects += cuda_static_lib.extract_all_objects() diff --git a/libvmaf/src/picture.h b/libvmaf/src/picture.h index abc50789e..86ff0487e 100644 --- a/libvmaf/src/picture.h +++ b/libvmaf/src/picture.h @@ -20,9 +20,14 @@ #define __VMAF_SRC_PICTURE_H__ #ifdef HAVE_CUDA +#ifdef DEVICE_CODE +#include +typedef struct VmafCudaState VmafCudaState; +#else #include 
#include "libvmaf/libvmaf_cuda.h" #endif +#endif #include "libvmaf/picture.h" enum VmafPictureBufferType {