diff --git a/.github/ci_config.yml b/.github/ci_config.yml index 4ab61601..406f253c 100644 --- a/.github/ci_config.yml +++ b/.github/ci_config.yml @@ -15,7 +15,7 @@ platforms: BASE_IMAGE: nvcr.io/nvidia/pytorch:24.10-py3 SKIP_APT: "1" PIP_INDEX_URL: https://pypi.tuna.tsinghua.edu.cn/simple - setup: pip install .[dev] --no-build-isolation + setup: pip install .[dev] --no-build-isolation --config-settings=cmake.define.AUTO_DETECT_DEVICES=OFF --config-settings=cmake.define.WITH_CPU=ON --config-settings=cmake.define.WITH_NVIDIA=ON jobs: gpu: type: unittest @@ -50,12 +50,11 @@ platforms: - /lib/firmware:/lib/firmware - /usr/src:/usr/src - /lib/modules:/lib/modules - setup: python -m pip install packaging exceptiongroup typing-extensions pygments pybind11 libclang && python -m pip install . --no-build-isolation --no-deps + setup: python -m pip install packaging exceptiongroup typing-extensions pygments pybind11 libclang && python -m pip install . --no-build-isolation --no-deps --config-settings=cmake.define.AUTO_DETECT_DEVICES=OFF --config-settings=cmake.define.WITH_CPU=ON --config-settings=cmake.define.WITH_ILUVATAR=ON jobs: gpu: type: unittest resources: - gpu_ids: "0" ngpus: 1 gpu_style: none memory: 32GB @@ -65,7 +64,7 @@ platforms: junit_path: test-results.xml stages: - name: test - run: pytest tests/ --devices iluvatar -n 4 -v --tb=short --junitxml=/workspace/results/test-results.xml + run: pytest tests/ --devices iluvatar -n 2 -v --tb=short --junitxml=/workspace/results/test-results.xml metax: runner_label: Metax @@ -80,7 +79,7 @@ platforms: - "--privileged" - "--ulimit=memlock=-1" - "--ulimit=stack=67108864" - setup: pip install .[dev] --no-build-isolation + setup: pip install .[dev] --no-build-isolation --config-settings=cmake.define.AUTO_DETECT_DEVICES=OFF --config-settings=cmake.define.WITH_CPU=ON --config-settings=cmake.define.WITH_METAX=ON jobs: gpu: type: unittest @@ -107,7 +106,7 @@ platforms: PIP_INDEX_URL: https://pypi.org/simple docker_args: - "--privileged" - setup: pip install .[dev] --no-build-isolation + setup: pip install .[dev] --no-build-isolation --config-settings=cmake.define.AUTO_DETECT_DEVICES=OFF --config-settings=cmake.define.WITH_CPU=ON --config-settings=cmake.define.WITH_MOORE=ON jobs: gpu: type: unittest @@ -133,7 +132,7 @@ platforms: PIP_INDEX_URL: https://pypi.org/simple docker_args: - "--privileged" - setup: pip install .[dev] --no-build-isolation + setup: pip install .[dev] --no-build-isolation --config-settings=cmake.define.AUTO_DETECT_DEVICES=OFF --config-settings=cmake.define.WITH_CPU=ON --config-settings=cmake.define.WITH_CAMBRICON=ON jobs: gpu: type: unittest @@ -168,7 +167,7 @@ platforms: - "--group-add=video" volumes: - /opt/hyhal:/opt/hyhal:ro - setup: pip install .[dev] --no-build-isolation + setup: pip install .[dev] --no-build-isolation --config-settings=cmake.define.AUTO_DETECT_DEVICES=OFF --config-settings=cmake.define.WITH_CPU=ON --config-settings=cmake.define.WITH_HYGON=ON jobs: gpu: type: unittest @@ -205,7 +204,7 @@ platforms: - /usr/local/bin/npu-smi:/usr/local/bin/npu-smi:ro env: ASCEND_HOME_PATH: /usr/local/Ascend/ascend-toolkit/latest - setup: pip install .[dev] --no-build-isolation + setup: pip install .[dev] --no-build-isolation --config-settings=cmake.define.AUTO_DETECT_DEVICES=OFF --config-settings=cmake.define.WITH_CPU=ON --config-settings=cmake.define.WITH_ASCEND=ON jobs: npu: type: unittest diff --git a/CMakeLists.txt b/CMakeLists.txt index 4cdc5487..0fea233a 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -442,6 +442,7 @@ if(WITH_MOORE) find_library(MUSA_LIB NAMES musa HINTS "${MUSA_ROOT}/lib" REQUIRED) find_library(MUSART_LIB NAMES musart HINTS "${MUSA_ROOT}/lib" REQUIRED) find_library(MUBLAS_LIB NAMES mublas HINTS "${MUSA_ROOT}/lib" REQUIRED) + find_library(MOORE_OPENMP_LIB NAMES omp iomp5 HINTS "${MUSA_ROOT}/lib" "${MUSA_ROOT}/lib64") endif() if(WITH_CAMBRICON) diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 4b0ca302..0db18c1a 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -178,6 +178,9 @@ if(WITH_MOORE) target_include_directories(infiniops PUBLIC "${MUSA_ROOT}/include") target_link_libraries(infiniops PUBLIC ${MUSA_LIB} ${MUSART_LIB} ${MUBLAS_LIB}) + if(MOORE_OPENMP_LIB) + target_link_libraries(infiniops PUBLIC ${MOORE_OPENMP_LIB}) + endif() list(APPEND DEVICE_LIST "moore") endif() @@ -542,10 +545,39 @@ if(GENERATE_OPERATOR_CALL_INSTANTIATIONS) file(GLOB_RECURSE OPERATOR_CALL_INSTANTIATION_SOURCES CONFIGURE_DEPENDS "${PROJECT_SOURCE_DIR}/generated/src/operator_call_instantiations_*.cc") + set(_operator_call_instantiation_job_pool_arg) + if(WITH_TORCH AND CMAKE_GENERATOR MATCHES "Ninja") + set(INFINIOPS_OPERATOR_CALL_INSTANTIATION_COMPILE_JOBS "2" CACHE STRING + "Maximum concurrent generated operator call instantiation compilations") + set_property(GLOBAL APPEND PROPERTY JOB_POOLS + operator_call_instantiation_compile=${INFINIOPS_OPERATOR_CALL_INSTANTIATION_COMPILE_JOBS}) + set(_operator_call_instantiation_job_pool_arg + JOB_POOL operator_call_instantiation_compile) + endif() + if(WITH_NVIDIA OR WITH_HYGON) set_source_files_properties(${OPERATOR_CALL_INSTANTIATION_SOURCES} PROPERTIES LANGUAGE CUDA) - target_sources(infiniops PRIVATE ${OPERATOR_CALL_INSTANTIATION_SOURCES}) + if(WITH_TORCH AND CMAKE_GENERATOR MATCHES "Ninja") + add_library(infiniops_operator_call_instantiation_objs OBJECT + ${OPERATOR_CALL_INSTANTIATION_SOURCES}) + set_target_properties(infiniops_operator_call_instantiation_objs + PROPERTIES + CUDA_STANDARD 17 + CUDA_STANDARD_REQUIRED ON + JOB_POOL_COMPILE operator_call_instantiation_compile + POSITION_INDEPENDENT_CODE ON) + target_include_directories(infiniops_operator_call_instantiation_objs PRIVATE + $) + target_compile_definitions(infiniops_operator_call_instantiation_objs PRIVATE + $) + target_compile_options(infiniops_operator_call_instantiation_objs PRIVATE + $) + target_sources(infiniops PRIVATE + $) + else() + target_sources(infiniops PRIVATE ${OPERATOR_CALL_INSTANTIATION_SOURCES}) + endif() elseif(WITH_ILUVATAR) set(_iluvatar_call_instantiation_include_flags "-I${CMAKE_CURRENT_SOURCE_DIR}" @@ -591,6 +623,7 @@ if(GENERATE_OPERATOR_CALL_INSTANTIATIONS) -c "${_src}" -o "${_obj}" DEPENDS "${_src}" ${_depfile_arg} + ${_operator_call_instantiation_job_pool_arg} COMMENT "Compiling ${_name}.cc with CoreX clang++" VERBATIM )