diff --git a/.github/workflows/linux.yml b/.github/workflows/linux.yml index 03a914bda..afae73268 100644 --- a/.github/workflows/linux.yml +++ b/.github/workflows/linux.yml @@ -10,46 +10,50 @@ defaults: jobs: build: runs-on: ubuntu-latest - name: '${{ matrix.sys.compiler }} ${{ matrix.sys.version }} - ${{ matrix.sys.flags }}' + name: '${{ matrix.sys.compiler }} ${{ matrix.sys.version }} - ${{ matrix.sys.preset }} - ${{ matrix.sys.flags }}' strategy: matrix: sys: - - { compiler: 'gcc', version: '12', flags: 'force_no_instr_set' } - - { compiler: 'gcc', version: '13', flags: 'enable_xtl_complex' } - - { compiler: 'gcc', version: '14', flags: 'avx' } - - { compiler: 'gcc', version: '14', flags: 'avx2' } - - { compiler: 'gcc', version: '13', flags: 'avx512' } - - { compiler: 'gcc', version: '10', flags: 'avx512' } - - { compiler: 'gcc', version: '12', flags: 'i386' } - - { compiler: 'gcc', version: '13', flags: 'avx512pf' } - - { compiler: 'gcc', version: '13', flags: 'avx512vbmi' } - - { compiler: 'gcc', version: '14', flags: 'avx512vbmi2' } - - { compiler: 'gcc', version: '13', flags: 'avx512vnni' } - - { compiler: 'clang', version: '16', flags: 'force_no_instr_set' } - - { compiler: 'clang', version: '16', flags: 'enable_xtl_complex' } - - { compiler: 'clang', version: '17', flags: 'avx' } - - { compiler: 'clang', version: '17', flags: 'sse3' } - - { compiler: 'clang', version: '18', flags: 'avx512' } - - { compiler: 'clang', version: '18', flags: 'avx_128' } - - { compiler: 'clang', version: '18', flags: 'avx2_128' } - - { compiler: 'clang', version: '18', flags: 'avx512vl_128' } - - { compiler: 'clang', version: '18', flags: 'avx512vl_256' } + - { compiler: 'gcc', version: '12', flags: 'force_no_instr_set', preset: 'native' } + - { compiler: 'gcc', version: '13', flags: 'enable_xtl_complex', preset: 'native' } + - { compiler: 'gcc', version: '14', flags: '', preset: 'avx' } + - { compiler: 'gcc', version: '14', flags: '', preset: 'avx2' } + - { compiler: 'gcc', version: '13', flags: '', preset: 'avx512f' } + - { compiler: 'gcc', version: '10', flags: '', preset: 'avx512f' } + - { compiler: 'gcc', version: '12', flags: 'i386', preset: 'native' } + - { compiler: 'gcc', version: '13', flags: '', preset: 'avx512pf' } + - { compiler: 'gcc', version: '13', flags: '', preset: 'avx512vbmi' } + - { compiler: 'gcc', version: '14', flags: '', preset: 'avx512vbmi2' } + - { compiler: 'gcc', version: '13', flags: '', preset: 'avx512vnni_avx512bw' } + - { compiler: 'clang', version: '16', flags: 'force_no_instr_set', preset: 'native' } + - { compiler: 'clang', version: '16', flags: 'enable_xtl_complex', preset: 'native' } + - { compiler: 'clang', version: '17', flags: '', preset: 'avx' } + - { compiler: 'clang', version: '17', flags: '', preset: 'sse3' } + - { compiler: 'clang', version: '18', flags: '', preset: 'avx512f' } + - { compiler: 'clang', version: '18', flags: '', preset: 'avx_128' } + - { compiler: 'clang', version: '18', flags: '', preset: 'avx2_128' } + - { compiler: 'clang', version: '18', flags: '', preset: 'avx512vl_128' } + - { compiler: 'clang', version: '18', flags: '', preset: 'avx512vl_256' } steps: - - name: Setup compiler + - name: Setup GCC compiler if: ${{ matrix.sys.compiler == 'gcc' }} run: | GCC_VERSION=${{ matrix.sys.version }} sudo apt-get update sudo apt-get --no-install-suggests --no-install-recommends install g++-$GCC_VERSION - sudo dpkg --add-architecture i386 - sudo add-apt-repository ppa:ubuntu-toolchain-r/test - sudo apt-get update - sudo apt-get --no-install-suggests --no-install-recommends install gcc-$GCC_VERSION-multilib g++-$GCC_VERSION-multilib linux-libc-dev:i386 - CC=gcc-$GCC_VERSION - echo "CC=$CC" >> $GITHUB_ENV - CXX=g++-$GCC_VERSION - echo "CXX=$CXX" >> $GITHUB_ENV - - name: Setup compiler + # Setup i386 as needed + if [[ '${{ matrix.sys.flags }}' == 'i386' ]]; then + sudo dpkg --add-architecture i386 + sudo add-apt-repository ppa:ubuntu-toolchain-r/test + sudo apt-get update + sudo apt-get --no-install-suggests --no-install-recommends install \ + gcc-$GCC_VERSION-multilib g++-$GCC_VERSION-multilib linux-libc-dev:i386 + fi + # Export compiler as environment var + echo "CC=gcc-$GCC_VERSION" >> $GITHUB_ENV + echo "CXX=g++-$GCC_VERSION" >> $GITHUB_ENV + + - name: Setup Clang compiler if: ${{ matrix.sys.compiler == 'clang' }} run: | LLVM_VERSION=${{ matrix.sys.version }} @@ -57,91 +61,49 @@ jobs: sudo apt-get --no-install-suggests --no-install-recommends install clang-$LLVM_VERSION || exit 1 sudo apt-get --no-install-suggests --no-install-recommends install g++ g++-multilib || exit 1 sudo ln -s /usr/include/asm-generic /usr/include/asm - CC=clang-$LLVM_VERSION - echo "CC=$CC" >> $GITHUB_ENV - CXX=clang++-$LLVM_VERSION - echo "CXX=$CXX" >> $GITHUB_ENV + # Export compiler as environment var + echo "CC=clang-$LLVM_VERSION" >> $GITHUB_ENV + echo "CXX=clang++-$LLVM_VERSION" >> $GITHUB_ENV + - name: Checkout xsimd uses: actions/checkout@v6 + - name: Install mamba - uses: mamba-org/setup-micromamba@v2 + uses: mamba-org/setup-micromamba@v3 with: environment-file: environment.yml + - name: Setup SDE - if: startswith(matrix.sys.flags, 'avx512') + if: startswith(matrix.sys.preset, 'avx512') run: sh install_sde.sh + - name: Configure build - env: - CC: ${{ env.CC }} - CXX: ${{ env.CXX }} run: | if [[ '${{ matrix.sys.flags }}' == 'enable_xtl_complex' ]]; then CMAKE_EXTRA_ARGS="$CMAKE_EXTRA_ARGS -DENABLE_XTL_COMPLEX=ON" fi - if [[ '${{ matrix.sys.flags }}' == 'avx' ]]; then - CMAKE_EXTRA_ARGS="$CMAKE_EXTRA_ARGS -DTARGET_ARCH=sandybridge" - fi - if [[ '${{ matrix.sys.flags }}' == 'avx_128' ]]; then - CMAKE_EXTRA_ARGS="$CMAKE_EXTRA_ARGS -DTARGET_ARCH=sandybridge" - CXXFLAGS="$CXX_FLAGS -DXSIMD_DEFAULT_ARCH=avx_128" - fi - if [[ '${{ matrix.sys.flags }}' == 'avx2' ]]; then - CMAKE_EXTRA_ARGS="$CMAKE_EXTRA_ARGS -DTARGET_ARCH=haswell" - fi - if [[ '${{ matrix.sys.flags }}' == 'avx2_128' ]]; then - CMAKE_EXTRA_ARGS="$CMAKE_EXTRA_ARGS -DTARGET_ARCH=haswell" - CXXFLAGS="$CXX_FLAGS -DXSIMD_DEFAULT_ARCH=avx2_128" - fi - if [[ '${{ matrix.sys.flags }}' == 'sse3' ]]; then - CMAKE_EXTRA_ARGS="$CMAKE_EXTRA_ARGS -DTARGET_ARCH=nocona" - fi - if [[ '${{ matrix.sys.flags }}' == 'avx512' ]]; then - CMAKE_EXTRA_ARGS="$CMAKE_EXTRA_ARGS -DTARGET_ARCH=skylake-avx512" - fi - if [[ '${{ matrix.sys.flags }}' == 'avx512vl_128' ]]; then - CMAKE_EXTRA_ARGS="$CMAKE_EXTRA_ARGS -DTARGET_ARCH=skylake-avx512" - CXXFLAGS="$CXX_FLAGS -DXSIMD_DEFAULT_ARCH=avx512vl_128" - fi - if [[ '${{ matrix.sys.flags }}' == 'avx512vl_256' ]]; then - CMAKE_EXTRA_ARGS="$CMAKE_EXTRA_ARGS -DTARGET_ARCH=skylake-avx512" - CXXFLAGS="$CXX_FLAGS -DXSIMD_DEFAULT_ARCH=avx512vl_256" - fi - if [[ '${{ matrix.sys.flags }}' == 'avx512pf' ]]; then - CMAKE_EXTRA_ARGS="$CMAKE_EXTRA_ARGS -DTARGET_ARCH=knl" - fi - if [[ '${{ matrix.sys.flags }}' == 'avx512vbmi' ]]; then - CMAKE_EXTRA_ARGS="$CMAKE_EXTRA_ARGS -DTARGET_ARCH=cannonlake" - fi - if [[ '${{ matrix.sys.flags }}' == 'avx512vbmi2' ]]; then - CMAKE_EXTRA_ARGS="$CMAKE_EXTRA_ARGS -DTARGET_ARCH=icelake-server" - fi - if [[ '${{ matrix.sys.flags }}' == 'avx512vnni' ]]; then - CMAKE_EXTRA_ARGS="$CMAKE_EXTRA_ARGS -DTARGET_ARCH=knm" - fi if [[ '${{ matrix.sys.flags }}' == 'i386' ]]; then - CXX_FLAGS="$CXX_FLAGS -m32" + export CXXFLAGS="$CXXFLAGS -m32" fi - if [[ '${{ matrix.sys.flags }}' == 'force_no_instr_set' ]]; then - : - else + if [[ '${{ matrix.sys.flags }}' != 'force_no_instr_set' ]]; then CMAKE_EXTRA_ARGS="$CMAKE_EXTRA_ARGS -DXSIMD_ENABLE_WERROR=ON" fi - # Cheap way of spotting uninitialized read - CXX_FLAGS="$CXX_FLAGS -ftrivial-auto-var-init=pattern" - cmake -B _build \ - -DBUILD_TESTS=ON \ - -DBUILD_BENCHMARK=ON \ - -DBUILD_EXAMPLES=ON \ - -DCMAKE_BUILD_TYPE=Release \ - -DCMAKE_C_COMPILER=$CC \ - -DCMAKE_CXX_COMPILER=$CXX \ - $CMAKE_EXTRA_ARGS \ - -DCMAKE_CXX_FLAGS='$CXX_FLAGS' \ + --preset ${{ matrix.sys.preset }} \ + -D BUILD_TESTS=ON \ + -D BUILD_BENCHMARK=ON \ + -D BUILD_EXAMPLES=ON \ + -D CMAKE_BUILD_TYPE=Release \ + -D CMAKE_C_COMPILER="${CC}" \ + -D CMAKE_CXX_COMPILER="${CXX}" \ + -D TARGET_ARCH="x86-64" \ + -D XSIMD_HARDEN_TRIVIAL_AUTO_VAR_INIT=ON \ + "${CMAKE_EXTRA_ARGS}" \ -G Ninja + - name: Build - run: cmake --build _build + run: cmake --build _build --parallel - name: Test run: | # Set CPU feature test expectations, 0 is explicit absence of the feature @@ -149,15 +111,15 @@ jobs: export XSIMD_TEST_CPU_ASSUME_RVV="0" export XSIMD_TEST_CPU_ASSUME_VSX="0" export XSIMD_TEST_CPU_ASSUME_VXE="0" - cd _build/test - if echo '${{ matrix.sys.flags }}' | grep -q 'avx512' ; then + + if echo '${{ matrix.sys.preset }}' | grep -q 'avx512' ; then # Running with emulation, must have AVX512, lower tier are checked by implications in tests export XSIMD_TEST_CPU_ASSUME_AVX512F="1" - ../../sde-external-9.48.0-2024-11-25-lin/sde64 -tgl -- ./test_xsimd + ./sde-external-9.48.0-2024-11-25-lin/sde64 -spr -- ./_build/test/test_xsimd else export XSIMD_TEST_CPU_ASSUME_SSE4_2=$(grep -q 'sse4_2' /proc/cpuinfo && echo "1" || echo "0") export XSIMD_TEST_CPU_ASSUME_AVX=$(grep -q 'avx' /proc/cpuinfo && echo "1" || echo "0") export XSIMD_TEST_CPU_ASSUME_AVX512F=$(grep -q 'avx512f' /proc/cpuinfo && echo "1" || echo "0") export XSIMD_TEST_CPU_ASSUME_MANUFACTURER="intel,amd" - ./test_xsimd + ./_build/test/test_xsimd fi diff --git a/CMakeLists.txt b/CMakeLists.txt index 66c01f281..92f722aa8 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -54,6 +54,21 @@ if(ENABLE_XTL_COMPLEX) ) endif() +# Dev options +# =========== + +include (cmake/Hardening.cmake) + +option( + XSIMD_HARDEN_TRIVIAL_AUTO_VAR_INIT + "Enable -ftrivial-auto-var-init hardening flag if supported" + OFF +) + +if(XSIMD_HARDEN_TRIVIAL_AUTO_VAR_INIT) + xsimd_harden_trivial_auto_var_init(xsimd INTERFACE) +endif() + if(BUILD_TESTS) enable_testing() add_subdirectory(test) diff --git a/CMakePresets.json b/CMakePresets.json new file mode 100644 index 000000000..ed5314dda --- /dev/null +++ b/CMakePresets.json @@ -0,0 +1,214 @@ +{ + "version": 5, + "cmakeMinimumRequired": { + "major": 3, + "minor": 23, + "patch": 0 + }, + "configurePresets": [ + { + "name": "native", + "cacheVariables": { + "CMAKE_CXX_FLAGS": "$env{CXXFLAGS} -march=native" + } + }, + { + "name": "sse2", + "cacheVariables": { + "CMAKE_CXX_FLAGS": "$env{CXXFLAGS} -march=x86-64 -mno-sse4a -msse2 -mno-sse3" + } + }, + { + "name": "sse3", + "cacheVariables": { + "CMAKE_CXX_FLAGS": "$env{CXXFLAGS} -march=x86-64 -mno-sse4a -msse3 -mno-ssse3" + } + }, + { + "name": "ssse3", + "cacheVariables": { + "CMAKE_CXX_FLAGS": "$env{CXXFLAGS} -march=x86-64 -mno-sse4a -mssse3 -mno-sse4.1" + } + }, + { + "name": "sse4.1", + "cacheVariables": { + "CMAKE_CXX_FLAGS": "$env{CXXFLAGS} -march=x86-64 -mno-sse4a -msse4.1 -mno-sse4.2" + } + }, + { + "name": "sse4.2", + "cacheVariables": { + "CMAKE_CXX_FLAGS": "$env{CXXFLAGS} -march=x86-64 -mno-sse4a -msse4.2 -mno-avx" + } + }, + { + "name": "avx", + "cacheVariables": { + "CMAKE_CXX_FLAGS": "$env{CXXFLAGS} -march=x86-64-v2 -mno-sse4a -mavx -mno-avx2" + } + }, + { + "name": "avx_128", + "cacheVariables": { + "CMAKE_CXX_FLAGS": "$env{CXXFLAGS} -march=x86-64-v2 -mno-sse4a -mavx -mno-avx2 -DXSIMD_DEFAULT_ARCH=avx_128" + } + }, + { + "name": "avx2", + "cacheVariables": { + "CMAKE_CXX_FLAGS": "$env{CXXFLAGS} -march=x86-64-v2 -mno-sse4a -mavx2 -mno-avx512f" + } + }, + { + "name": "avx2_128", + "cacheVariables": { + "CMAKE_CXX_FLAGS": "$env{CXXFLAGS} -march=x86-64-v2 -mno-sse4a -mavx2 -mno-avx512f -DXSIMD_DEFAULT_ARCH=avx2_128" + } + }, + { + "name": "avx512f", + "cacheVariables": { + "CMAKE_CXX_FLAGS": "$env{CXXFLAGS} -march=x86-64-v2 -mno-sse4a -mavx -mavx2 -mavx512f -mno-avx512cd -mno-avx512dq -mno-avx512bw -mno-avx512er -mno-avx512pf -mno-avx512ifma -mno-avx512vbmi -mno-avx512vbmi2 -mno-avx512vnni" + } + }, + { + "name": "avx512cd", + "cacheVariables": { + "CMAKE_CXX_FLAGS": "$env{CXXFLAGS} -march=x86-64-v2 -mno-sse4a -mavx -mavx2 -mavx512f -mavx512cd -mno-avx512dq -mno-avx512bw -mno-avx512er -mno-avx512pf -mno-avx512ifma -mno-avx512vbmi -mno-avx512vbmi2 -mno-avx512vnni" + } + }, + { + "name": "avx512dq", + "cacheVariables": { + "CMAKE_CXX_FLAGS": "$env{CXXFLAGS} -march=x86-64-v2 -mno-sse4a -mavx -mavx2 -mavx512f -mavx512cd -mavx512dq -mno-avx512bw -mno-avx512er -mno-avx512pf -mno-avx512ifma -mno-avx512vbmi -mno-avx512vbmi2 -mno-avx512vnni" + } + }, + { + "name": "avx512bw", + "cacheVariables": { + "CMAKE_CXX_FLAGS": "$env{CXXFLAGS} -march=x86-64-v2 -mno-sse4a -mavx -mavx2 -mavx512f -mavx512cd -mavx512dq -mavx512bw -mno-avx512er -mno-avx512pf -mno-avx512ifma -mno-avx512vbmi -mno-avx512vbmi2 -mno-avx512vnni" + } + }, + { + "name": "avx512er", + "cacheVariables": { + "CMAKE_CXX_FLAGS": "$env{CXXFLAGS} -march=x86-64-v2 -mno-sse4a -mavx -mavx2 -mavx512f -mavx512cd -mavx512dq -mno-avx512bw -mavx512er -mno-avx512pf -mno-avx512ifma -mno-avx512vbmi -mno-avx512vbmi2 -mno-avx512vnni" + } + }, + { + "name": "avx512pf", + "cacheVariables": { + "CMAKE_CXX_FLAGS": "$env{CXXFLAGS} -march=x86-64-v2 -mno-sse4a -mavx -mavx2 -mavx512f -mavx512cd -mavx512dq -mno-avx512bw -mavx512er -mavx512pf -mno-avx512ifma -mno-avx512vbmi -mno-avx512vbmi2 -mno-avx512vnni" + } + }, + { + "name": "avx512ifma", + "cacheVariables": { + "CMAKE_CXX_FLAGS": "$env{CXXFLAGS} -march=x86-64-v2 -mno-sse4a -mavx -mavx2 -mavx512f -mavx512cd -mavx512dq -mavx512bw -mno-avx512er -mno-avx512pf -mavx512ifma -mno-avx512vbmi -mno-avx512vbmi2 -mno-avx512vnni" + } + }, + { + "name": "avx512vbmi", + "cacheVariables": { + "CMAKE_CXX_FLAGS": "$env{CXXFLAGS} -march=x86-64-v2 -mno-sse4a -mavx -mavx2 -mavx512f -mavx512cd -mavx512dq -mavx512bw -mno-avx512er -mno-avx512pf -mavx512ifma -mavx512vbmi -mno-avx512vbmi2 -mno-avx512vnni" + } + }, + { + "name": "avx512vbmi2", + "cacheVariables": { + "CMAKE_CXX_FLAGS": "$env{CXXFLAGS} -march=x86-64-v2 -mno-sse4a -mavx -mavx2 -mavx512f -mavx512cd -mavx512dq -mavx512bw -mno-avx512er -mno-avx512pf -mavx512ifma -mavx512vbmi -mavx512vbmi2 -mno-avx512vnni" + } + }, + { + "name": "avx512vnni_avx512bw", + "cacheVariables": { + "CMAKE_CXX_FLAGS": "$env{CXXFLAGS} -march=x86-64-v2 -mno-sse4a -mavx -mavx2 -mavx512f -mavx512cd -mavx512dq -mavx512bw -mno-avx512er -mno-avx512pf -mno-avx512ifma -mno-avx512vbmi -mno-avx512vbmi2 -mavx512vnni" + } + }, + { + "name": "avx512vnni_avx512vbmi2", + "cacheVariables": { + "CMAKE_CXX_FLAGS": "$env{CXXFLAGS} -march=x86-64-v2 -mno-sse4a -mavx -mavx2 -mavx512f -mavx512cd -mavx512dq -mavx512bw -mno-avx512er -mno-avx512pf -mavx512ifma -mavx512vbmi -mavx512vbmi2 -mavx512vnni" + } + }, + { + "name": "avx512vl_128", + "cacheVariables": { + "CMAKE_CXX_FLAGS": "$env{CXXFLAGS} -march=x86-64-v2 -mno-sse4a -mavx -mavx2 -mavx512f -mavx512cd -mavx512vl -mno-avx512dq -mno-avx512bw -mno-avx512er -mno-avx512pf -mno-avx512ifma -mno-avx512vbmi -mno-avx512vbmi2 -mno-avx512vnni -DXSIMD_DEFAULT_ARCH=avx512vl_128" + } + }, + { + "name": "avx512vl_256", + "cacheVariables": { + "CMAKE_CXX_FLAGS": "$env{CXXFLAGS} -march=x86-64-v2 -mno-sse4a -mavx -mavx2 -mavx512f -mavx512cd -mavx512vl -mno-avx512dq -mno-avx512bw -mno-avx512er -mno-avx512pf -mno-avx512ifma -mno-avx512vbmi -mno-avx512vbmi2 -mno-avx512vnni -DXSIMD_DEFAULT_ARCH=avx512vl_256" + } + }, + { + "name": "neon", + "cacheVariables": { + "CMAKE_CXX_FLAGS": "$env{CXXFLAGS} -march=armv7-a -mfpu=neon -mfloat-abi=softfp" + } + }, + { + "name": "neon64", + "cacheVariables": { + "CMAKE_CXX_FLAGS": "$env{CXXFLAGS} -march=armv8-a" + } + }, + { + "name": "sve128", + "cacheVariables": { + "CMAKE_CXX_FLAGS": "$env{CXXFLAGS} -march=armv8.2-a+sve -msve-vector-bits=128" + } + }, + { + "name": "sve256", + "cacheVariables": { + "CMAKE_CXX_FLAGS": "$env{CXXFLAGS} -march=armv8.2-a+sve -msve-vector-bits=256" + } + }, + { + "name": "sve512", + "cacheVariables": { + "CMAKE_CXX_FLAGS": "$env{CXXFLAGS} -march=armv8.2-a+sve -msve-vector-bits=512" + } + }, + { + "name": "rvv128", + "cacheVariables": { + "CMAKE_CXX_FLAGS": "$env{CXXFLAGS} -march=rv64gcv_zvl128b_zba_zbb_zbs -mrvv-vector-bits=zvl" + } + }, + { + "name": "rvv256", + "cacheVariables": { + "CMAKE_CXX_FLAGS": "$env{CXXFLAGS} -march=rv64gcv_zvl256b_zba_zbb_zbs -mrvv-vector-bits=zvl" + } + }, + { + "name": "rvv512", + "cacheVariables": { + "CMAKE_CXX_FLAGS": "$env{CXXFLAGS} -march=rv64gcv_zvl512b_zba_zbb_zbs -mrvv-vector-bits=zvl" + } + }, + { + "name": "vsx2", + "cacheVariables": { + "CMAKE_CXX_FLAGS": "$env{CXXFLAGS} -mcpu=power8 -maltivec -mvsx" + } + }, + { + "name": "vsx3", + "cacheVariables": { + "CMAKE_CXX_FLAGS": "$env{CXXFLAGS} -mcpu=power9 -maltivec -mvsx" + } + }, + { + "name": "vsx4", + "cacheVariables": { + "CMAKE_CXX_FLAGS": "$env{CXXFLAGS} -mcpu=power10 -maltivec -mvsx" + } + } + ] +} diff --git a/cmake/Hardening.cmake b/cmake/Hardening.cmake new file mode 100644 index 000000000..cc49bb378 --- /dev/null +++ b/cmake/Hardening.cmake @@ -0,0 +1,37 @@ +include(CheckCXXCompilerFlag) + + +function(xsimd_harden_trivial_auto_var_init target scope) + # Names of option parameters (without arguments) + set(options) + # Names of named parameters with a single argument + set(one_value_args PATTERN) + # Names of named parameters with a multiple arguments + set(multi_values_args) + cmake_parse_arguments(ARG "${options}" "${one_value_args}" "${multi_values_args}" ${ARGN}) + if(ARG_UNPARSED_ARGUMENTS) + message( + AUTHOR_WARNING + "Unrecoginzed options passed to ${CMAKE_CURRENT_FUNCTION}: " + "${ARG_UNPARSED_ARGUMENTS}" + ) + endif() + + if(NOT scope STREQUAL "PUBLIC" AND NOT scope STREQUAL "PRIVATE" AND NOT scope STREQUAL "INTERFACE") + message(FATAL_ERROR "scope must be PUBLIC, PRIVATE, or INTERFACE, got: ${scope}") + endif() + + if(NOT XSIMD_HARDEN_TRIVIAL_AUTO_VAR_INIT) + return() + endif() + + if(NOT ARG_PATTERN) + set(ARG_PATTERN "pattern") + endif() + + set(flag "-ftrivial-auto-var-init=${ARG_PATTERN}") + check_cxx_compiler_flag("${flag}" XSIMD_HAS_FTRIVIAL_AUTO_VAR_INIT_${ARG_PATTERN}) + if(XSIMD_HAS_FTRIVIAL_AUTO_VAR_INIT_${ARG_PATTERN}) + target_compile_options(${target} ${scope} "${flag}") + endif() +endfunction() diff --git a/include/xsimd/arch/common/xsimd_common_memory.hpp b/include/xsimd/arch/common/xsimd_common_memory.hpp index 7a1ed73a3..23b0606ac 100644 --- a/include/xsimd/arch/common/xsimd_common_memory.hpp +++ b/include/xsimd/arch/common/xsimd_common_memory.hpp @@ -13,6 +13,7 @@ #define XSIMD_COMMON_MEMORY_HPP #include "../../types/xsimd_batch_constant.hpp" +#include "../../utils/xsimd_type_traits.hpp" #include "./xsimd_common_details.hpp" #include @@ -360,88 +361,81 @@ namespace xsimd return load_unaligned(mem, convert {}, A {}); } - template - XSIMD_INLINE batch - load_masked(T_in const* mem, batch_bool_constant, convert, alignment, requires_arch) noexcept - { - constexpr std::size_t size = batch::size; - alignas(A::alignment()) std::array buffer {}; - constexpr bool mask[size] = { Values... }; - - for (std::size_t i = 0; i < size; ++i) - buffer[i] = mask[i] ? static_cast(mem[i]) : T_out(0); - - return batch::load(buffer.data(), aligned_mode {}); - } - - template - XSIMD_INLINE void - store_masked(T_out* mem, batch const& src, batch_bool_constant, alignment, requires_arch) noexcept + namespace detail { - constexpr std::size_t size = batch::size; - constexpr bool mask[size] = { Values... }; + // Compile-time dispatch tag for the common `load_masked`/ `store_masked` + // implementations: true iff we can use the int->float bitcast path (matching-size + // integer T_in/T_out with a SIMD register available for the matching + // floating-point type), false otherwise (use the scalar buffer fallback). + template + using common_masked_via_fp = std::integral_constant::value + && std::is_integral::value + && !std::is_void>::value + && types::has_simd_register, A>::value>; - for (std::size_t i = 0; i < size; ++i) - if (mask[i]) - { - mem[i] = static_cast(src.get(i)); - } - } + // Scalar-buffer fallback: works for any T_in/T_out. + template + XSIMD_INLINE batch + load_masked_common(T_in const* mem, batch_bool_constant, convert, alignment, std::false_type /* via_fp */) noexcept + { + constexpr std::size_t size = batch::size; + alignas(A::alignment()) std::array buffer {}; + constexpr bool mask[size] = { Values... }; - template - XSIMD_INLINE batch load_masked(int32_t const* mem, batch_bool_constant, convert, Mode, requires_arch) noexcept - { - const auto f = load_masked(reinterpret_cast(mem), batch_bool_constant {}, convert {}, Mode {}, A {}); - return bitwise_cast(f); - } + for (std::size_t i = 0; i < size; ++i) + buffer[i] = mask[i] ? static_cast(mem[i]) : T_out(0); - template - XSIMD_INLINE batch load_masked(uint32_t const* mem, batch_bool_constant, convert, Mode, requires_arch) noexcept - { - const auto f = load_masked(reinterpret_cast(mem), batch_bool_constant {}, convert {}, Mode {}, A {}); - return bitwise_cast(f); - } + return batch::load(buffer.data(), aligned_mode {}); + } - template - XSIMD_INLINE std::enable_if_t::value, batch> - load_masked(int64_t const* mem, batch_bool_constant, convert, Mode, requires_arch) noexcept - { - const auto d = load_masked(reinterpret_cast(mem), batch_bool_constant {}, convert {}, Mode {}, A {}); - return bitwise_cast(d); - } + // Integer-via-float bitcast: T_in == T_out == integral T with a matching + // `sized_fp_t` for which the arch has a SIMD register. + // Dispatches to the floating `load_masked` (which is arch-specialized) and bitcasts back. + template + XSIMD_INLINE batch + load_masked_common(T const* mem, batch_bool_constant, convert, Mode, std::true_type /* via_fp */) noexcept + { + using fp_t = sized_fp_t; + const auto f = ::xsimd::kernel::load_masked(reinterpret_cast(mem), batch_bool_constant {}, convert {}, Mode {}, A {}); + return bitwise_cast(f); + } - template - XSIMD_INLINE std::enable_if_t::value, batch> - load_masked(uint64_t const* mem, batch_bool_constant, convert, Mode, requires_arch) noexcept - { - const auto d = load_masked(reinterpret_cast(mem), batch_bool_constant {}, convert {}, Mode {}, A {}); - return bitwise_cast(d); - } + template + XSIMD_INLINE void + store_masked_common(T_out* mem, batch const& src, batch_bool_constant, alignment, std::false_type /* via_fp */) noexcept + { + constexpr std::size_t size = batch::size; + constexpr bool mask[size] = { Values... }; - template - XSIMD_INLINE void store_masked(int32_t* mem, batch const& src, batch_bool_constant, Mode, requires_arch) noexcept - { - store_masked(reinterpret_cast(mem), bitwise_cast(src), batch_bool_constant {}, Mode {}, A {}); - } + for (std::size_t i = 0; i < size; ++i) + if (mask[i]) + { + mem[i] = static_cast(src.get(i)); + } + } - template - XSIMD_INLINE void store_masked(uint32_t* mem, batch const& src, batch_bool_constant, Mode, requires_arch) noexcept - { - store_masked(reinterpret_cast(mem), bitwise_cast(src), batch_bool_constant {}, Mode {}, A {}); - } + template + XSIMD_INLINE void + store_masked_common(T* mem, batch const& src, batch_bool_constant, Mode, std::true_type /* via_fp */) noexcept + { + using fp_t = sized_fp_t; + ::xsimd::kernel::store_masked(reinterpret_cast(mem), bitwise_cast(src), batch_bool_constant {}, Mode {}, A {}); + } + } // namespace detail - template - XSIMD_INLINE std::enable_if_t::value> - store_masked(int64_t* mem, batch const& src, batch_bool_constant, Mode, requires_arch) noexcept + template + XSIMD_INLINE batch + load_masked(T_in const* mem, batch_bool_constant mask, convert cvt, alignment mode, requires_arch) noexcept { - store_masked(reinterpret_cast(mem), bitwise_cast(src), batch_bool_constant {}, Mode {}, A {}); + return detail::load_masked_common(mem, mask, cvt, mode, detail::common_masked_via_fp {}); } - template - XSIMD_INLINE std::enable_if_t::value> - store_masked(uint64_t* mem, batch const& src, batch_bool_constant, Mode, requires_arch) noexcept + template + XSIMD_INLINE void + store_masked(T_out* mem, batch const& src, batch_bool_constant mask, alignment mode, requires_arch) noexcept { - store_masked(reinterpret_cast(mem), bitwise_cast(src), batch_bool_constant {}, Mode {}, A {}); + detail::store_masked_common(mem, src, mask, mode, detail::common_masked_via_fp {}); } template diff --git a/include/xsimd/arch/xsimd_avx.hpp b/include/xsimd/arch/xsimd_avx.hpp index 1ee0c5b89..24736821a 100644 --- a/include/xsimd/arch/xsimd_avx.hpp +++ b/include/xsimd/arch/xsimd_avx.hpp @@ -1016,16 +1016,16 @@ namespace xsimd } // store_masked - namespace detail + namespace detail_avx { template - XSIMD_INLINE void maskstore(float* mem, batch_bool const& mask, batch const& src) noexcept + XSIMD_INLINE void maskstore(float* mem, batch const& mask, batch const& src) noexcept { _mm256_maskstore_ps(mem, mask, src); } template - XSIMD_INLINE void maskstore(double* mem, batch_bool const& mask, batch const& src) noexcept + XSIMD_INLINE void maskstore(double* mem, batch const& mask, batch const& src) noexcept { _mm256_maskstore_pd(mem, mask, src); } @@ -1035,24 +1035,30 @@ namespace xsimd XSIMD_INLINE void store_masked(T* mem, batch const& src, batch_bool_constant mask, Mode, requires_arch) noexcept { constexpr size_t half_size = batch::size / 2; + using half_arch = avx_128; // confined to lower 128-bit half → forward to 128 bit XSIMD_IF_CONSTEXPR(mask.countl_zero() >= half_size) { - constexpr auto mlo = ::xsimd::detail::lower_half(mask); - const auto lo = detail::lower_half(src); - store_masked(mem, lo, mlo, Mode {}, sse4_2 {}); + constexpr auto mlo = ::xsimd::detail::lower_half(mask); + const auto lo = xsimd::batch(detail::lower_half(src)); + store_masked(mem, lo, mlo, Mode {}, half_arch {}); } // confined to upper 128-bit half → forward to 128 bit else XSIMD_IF_CONSTEXPR(mask.countr_zero() >= half_size) { - constexpr auto mhi = ::xsimd::detail::upper_half(mask); - const auto hi = detail::upper_half(src); - store_masked(mem + half_size, hi, mhi, Mode {}, sse4_2 {}); + constexpr auto mhi = ::xsimd::detail::upper_half(mask); + const auto hi = xsimd::batch(detail::upper_half(src)); + store_masked(mem + half_size, hi, mhi, Mode {}, half_arch {}); } else { - detail::maskstore(mem, mask.as_batch(), src); + using fp_t = sized_fp_t; + using int_t = sized_int_t; + detail_avx::maskstore( + reinterpret_cast(mem), + bitwise_cast(mask.as_batch()), + bitwise_cast(src)); } } diff --git a/include/xsimd/arch/xsimd_avx2.hpp b/include/xsimd/arch/xsimd_avx2.hpp index e2c223cc7..9b0d05a15 100644 --- a/include/xsimd/arch/xsimd_avx2.hpp +++ b/include/xsimd/arch/xsimd_avx2.hpp @@ -14,6 +14,7 @@ #include "../types/xsimd_avx2_register.hpp" #include "../types/xsimd_batch_constant.hpp" +#include "../utils/xsimd_type_traits.hpp" #include "./utils/shifts.hpp" #include @@ -175,7 +176,7 @@ namespace xsimd } // store_masked - namespace detail + namespace detail_avx2 { template XSIMD_INLINE void maskstore(int32_t* mem, __m256i mask, __m256i src) noexcept @@ -194,24 +195,29 @@ namespace xsimd XSIMD_INLINE void store_masked(T* mem, batch const& src, batch_bool_constant mask, Mode, requires_arch) noexcept { constexpr size_t lanes_per_half = batch::size / 2; + using half_arch = avx2_128; // confined to lower 128-bit half → forward to SSE XSIMD_IF_CONSTEXPR(mask.countl_zero() >= lanes_per_half) { - constexpr auto mlo = ::xsimd::detail::lower_half(mask); - const auto lo = detail::lower_half(src); - store_masked(mem, lo, mlo, Mode {}, sse4_2 {}); + constexpr auto mlo = ::xsimd::detail::lower_half(mask); + const auto lo = xsimd::batch(detail::lower_half(src)); + store_masked(mem, lo, mlo, Mode {}, half_arch {}); } // confined to upper 128-bit half → forward to SSE else XSIMD_IF_CONSTEXPR(mask.countr_zero() >= lanes_per_half) { - constexpr auto mhi = ::xsimd::detail::upper_half(mask); - const auto hi = detail::upper_half(src); - store_masked(mem + lanes_per_half, hi, mhi, Mode {}, sse4_2 {}); + constexpr auto mhi = ::xsimd::detail::upper_half(mask); + const auto hi = xsimd::batch(detail::upper_half(src)); + store_masked(mem + lanes_per_half, hi, mhi, Mode {}, half_arch {}); } else { - detail::maskstore(mem, mask.as_batch(), src); + using int_t = sized_int_t; + detail_avx2::maskstore( + reinterpret_cast(mem), + mask.as_batch(), + bitwise_cast(src)); } } diff --git a/include/xsimd/arch/xsimd_avx512f.hpp b/include/xsimd/arch/xsimd_avx512f.hpp index 6a7316722..879c56a07 100644 --- a/include/xsimd/arch/xsimd_avx512f.hpp +++ b/include/xsimd/arch/xsimd_avx512f.hpp @@ -335,13 +335,13 @@ namespace xsimd XSIMD_IF_CONSTEXPR(mask.countl_zero() >= half) // lower-half AVX2 forwarding { constexpr auto mlo = ::xsimd::detail::lower_half(mask); - const auto lo = detail::lower_half(src); + const auto lo = xsimd::batch(detail::lower_half(src)); store_masked(mem, lo, mlo, Mode {}, avx2 {}); } else XSIMD_IF_CONSTEXPR(mask.countr_zero() >= half) // upper-half AVX2 forwarding { constexpr auto mhi = ::xsimd::detail::upper_half(mask); - const auto hi = detail::upper_half(src); + const auto hi = xsimd::batch(detail::upper_half(src)); store_masked(mem + half, hi, mhi, Mode {}, avx2 {}); } else diff --git a/include/xsimd/arch/xsimd_common_fwd.hpp b/include/xsimd/arch/xsimd_common_fwd.hpp index f5a7f4ffe..e9ecf79ad 100644 --- a/include/xsimd/arch/xsimd_common_fwd.hpp +++ b/include/xsimd/arch/xsimd_common_fwd.hpp @@ -87,22 +87,6 @@ namespace xsimd XSIMD_INLINE batch load_masked(T_in const* mem, batch_bool_constant mask, convert, alignment, requires_arch) noexcept; template XSIMD_INLINE void store_masked(T_out* mem, batch const& src, batch_bool_constant mask, alignment, requires_arch) noexcept; - template - XSIMD_INLINE batch load_masked(int32_t const* mem, batch_bool_constant mask, convert, Mode, requires_arch) noexcept; - template - XSIMD_INLINE batch load_masked(uint32_t const* mem, batch_bool_constant mask, convert, Mode, requires_arch) noexcept; - template - XSIMD_INLINE std::enable_if_t::value, batch> load_masked(int64_t const*, batch_bool_constant, convert, Mode, requires_arch) noexcept; - template - XSIMD_INLINE std::enable_if_t::value, batch> load_masked(uint64_t const*, batch_bool_constant, convert, Mode, requires_arch) noexcept; - template - XSIMD_INLINE void store_masked(int32_t* mem, batch const& src, batch_bool_constant mask, Mode, requires_arch) noexcept; - template - XSIMD_INLINE void store_masked(uint32_t* mem, batch const& src, batch_bool_constant mask, Mode, requires_arch) noexcept; - template - XSIMD_INLINE std::enable_if_t::value> store_masked(int64_t*, batch const&, batch_bool_constant, Mode, requires_arch) noexcept; - template - XSIMD_INLINE std::enable_if_t::value> store_masked(uint64_t*, batch const&, batch_bool_constant, Mode, requires_arch) noexcept; // Forward declarations for pack-level helpers namespace detail diff --git a/include/xsimd/arch/xsimd_sse2.hpp b/include/xsimd/arch/xsimd_sse2.hpp index c6cfb5f07..0a95aae8b 100644 --- a/include/xsimd/arch/xsimd_sse2.hpp +++ b/include/xsimd/arch/xsimd_sse2.hpp @@ -2331,7 +2331,7 @@ namespace xsimd } else { - store_masked(mem, src, mask, requires_arch {}); + store_masked(mem, src, mask, aligned_mode {}, common {}); } }