Skip to content
Open
Show file tree
Hide file tree
Changes from 6 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -221,6 +221,10 @@ OPTION(EMBREE_MIN_WIDTH "Enables min-width feature to enlarge curve and point th
IF (APPLE AND CMAKE_SYSTEM_NAME STREQUAL "Darwin" AND (CMAKE_SYSTEM_PROCESSOR STREQUAL "arm64" AND CMAKE_OSX_ARCHITECTURES STREQUAL "") OR ("arm64" IN_LIST CMAKE_OSX_ARCHITECTURES))
MESSAGE(STATUS "Building for Apple silicon")
SET(EMBREE_ARM ON)
# CMAKE_SYSTEM_PROCESSOR is unreliable on Windows: when cross-compiling it would still report AMD64, so CMAKE_GENERATOR_PLATFORM is checked instead
ELSEIF(CMAKE_SYSTEM_NAME STREQUAL "Windows" AND CMAKE_GENERATOR_PLATFORM STREQUAL "ARM64")
MESSAGE(STATUS "Building for Windows ARM64 (MSVC)")
SET(EMBREE_ARM ON)
ELSEIF(CMAKE_SYSTEM_PROCESSOR STREQUAL "aarch64" OR CMAKE_SYSTEM_PROCESSOR STREQUAL "ARM64")
MESSAGE(STATUS "Building for AArch64")
SET(EMBREE_ARM ON)
Expand Down
2 changes: 1 addition & 1 deletion common/cmake/check_arm_neon.cpp
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
// Copyright 2009-2020 Intel Corporation
// SPDX-License-Identifier: Apache-2.0

#if !defined(__ARM_NEON)
#if !defined(__ARM_NEON) && !defined(_M_ARM64)
#error "No ARM Neon support"
#endif

Expand Down
21 changes: 16 additions & 5 deletions common/cmake/msvc.cmake
Original file line number Diff line number Diff line change
@@ -1,11 +1,18 @@
## Copyright 2009-2021 Intel Corporation
## SPDX-License-Identifier: Apache-2.0

SET(FLAGS_SSE2 "/D__SSE__ /D__SSE2__")
SET(FLAGS_SSE42 "${FLAGS_SSE2} /D__SSE3__ /D__SSSE3__ /D__SSE4_1__ /D__SSE4_2__")
SET(FLAGS_AVX "${FLAGS_SSE42} /arch:AVX")
SET(FLAGS_AVX2 "${FLAGS_SSE42} /arch:AVX2")
SET(FLAGS_AVX512 "${FLAGS_AVX2} /arch:AVX512")
# Per-ISA flag strings for MSVC (FLAGS_SSE2, FLAGS_SSE42, FLAGS_AVX, FLAGS_AVX2,
# FLAGS_AVX512), chosen according to whether EMBREE_ARM is set.
IF (EMBREE_ARM)
# ARM branch: every ISA level is expressed purely through /D preprocessor
# definitions; no /arch switch is passed to the compiler.
# NOTE(review): presumably the x86 feature macros are defined so that the
# SSE/AVX code paths are compiled against an emulation layer -- confirm.
# FLAGS_AVX512 is not assigned in this branch.
SET(FLAGS_SSE2 "/D__SSE__ /D__SSE2__")
SET(FLAGS_SSE42 "/D__SSE4_2__ /D__SSE4_1__")
SET(FLAGS_AVX "/D__AVX__ /D__SSE4_2__ /D__SSE4_1__ /D__BMI__ /D__BMI2__ /D__LZCNT__")
SET(FLAGS_AVX2 "/D__AVX2__ /D__AVX__ /D__SSE4_2__ /D__SSE4_1__ /D__BMI__ /D__BMI2__ /D__LZCNT__")
ELSE()
# Non-ARM branch: the SSE levels are expressed through /D definitions, with
# FLAGS_SSE42 extending FLAGS_SSE2. FLAGS_AVX and FLAGS_AVX2 each extend
# FLAGS_SSE42 with the matching /arch switch, and FLAGS_AVX512 extends
# FLAGS_AVX2 with /arch:AVX512.
SET(FLAGS_SSE2 "/D__SSE__ /D__SSE2__")
SET(FLAGS_SSE42 "${FLAGS_SSE2} /D__SSE3__ /D__SSSE3__ /D__SSE4_1__ /D__SSE4_2__")
SET(FLAGS_AVX "${FLAGS_SSE42} /arch:AVX")
SET(FLAGS_AVX2 "${FLAGS_SSE42} /arch:AVX2")
SET(FLAGS_AVX512 "${FLAGS_AVX2} /arch:AVX512")
ENDIF()

SET(COMMON_CXX_FLAGS "")
SET(COMMON_CXX_FLAGS "${COMMON_CXX_FLAGS} /EHsc") # catch C++ exceptions only and extern "C" functions never throw a C++ exception
Expand All @@ -17,6 +24,10 @@ IF (EMBREE_STACK_PROTECTOR)
ELSE()
SET(COMMON_CXX_FLAGS "${COMMON_CXX_FLAGS} /GS-") # do not protect against return address overrides
ENDIF()
# ARM builds only: append /Zc:preprocessor to the common C++ flags.
IF (EMBREE_ARM)
# sse2neon uses the new MSVC preprocessor, which /Zc:preprocessor enables
SET(COMMON_CXX_FLAGS "${COMMON_CXX_FLAGS} /Zc:preprocessor")
ENDIF()
MACRO(DISABLE_STACK_PROTECTOR_FOR_FILE file)
IF (EMBREE_STACK_PROTECTOR)
SET_SOURCE_FILES_PROPERTIES(${file} PROPERTIES COMPILE_FLAGS "/GS-")
Expand Down
4 changes: 2 additions & 2 deletions common/math/bbox.h
Original file line number Diff line number Diff line change
Expand Up @@ -82,7 +82,7 @@ namespace embree
return lower > upper;
}

#if defined(__SSE__) || defined(__ARM_NEON)
#if defined(__SSE__) || defined(__ARM_NEON) || defined(_M_ARM64)
template<> __forceinline bool BBox<Vec3fa>::empty() const {
return !all(le_mask(lower,upper));
}
Expand Down Expand Up @@ -233,7 +233,7 @@ namespace embree
/// SSE / AVX / MIC specializations
////////////////////////////////////////////////////////////////////////////////

#if defined (__SSE__) || defined(__ARM_NEON)
#if defined (__SSE__) || defined(__ARM_NEON) || defined(_M_ARM64)
#include "../simd/sse.h"
#endif

Expand Down
8 changes: 4 additions & 4 deletions common/math/color.h
Original file line number Diff line number Diff line change
Expand Up @@ -160,7 +160,7 @@ namespace embree
}
__forceinline const Color rcp ( const Color& a )
{
#if defined(__aarch64__)
#if defined(__aarch64__) || defined(_M_ARM64)
__m128 reciprocal = _mm_rcp_ps(a.m128);
reciprocal = vmulq_f32(vrecpsq_f32(a.m128, reciprocal), reciprocal);
reciprocal = vmulq_f32(vrecpsq_f32(a.m128, reciprocal), reciprocal);
Expand All @@ -173,11 +173,11 @@ namespace embree
#endif
return _mm_add_ps(r,_mm_mul_ps(r, _mm_sub_ps(_mm_set1_ps(1.0f), _mm_mul_ps(a, r)))); // computes r + r * (1 - a * r)

#endif //defined(__aarch64__)
#endif //defined(__aarch64__) || defined(_M_ARM64)
}
__forceinline const Color rsqrt( const Color& a )
{
#if defined(__aarch64__)
#if defined(__aarch64__) || defined(_M_ARM64)
__m128 r = _mm_rsqrt_ps(a.m128);
r = vmulq_f32(r, vrsqrtsq_f32(vmulq_f32(a.m128, r), r));
r = vmulq_f32(r, vrsqrtsq_f32(vmulq_f32(a.m128, r), r));
Expand All @@ -191,7 +191,7 @@ namespace embree
#endif
return _mm_add_ps(_mm_mul_ps(_mm_set1_ps(1.5f),r), _mm_mul_ps(_mm_mul_ps(_mm_mul_ps(a, _mm_set1_ps(-0.5f)), r), _mm_mul_ps(r, r)));

#endif //defined(__aarch64__)
#endif //defined(__aarch64__) || defined(_M_ARM64)
}
__forceinline const Color sqrt ( const Color& a ) { return _mm_sqrt_ps(a.m128); }

Expand Down
89 changes: 75 additions & 14 deletions common/math/emath.h
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@
# include "math_sycl.h"
#else

#if defined(__ARM_NEON)
#if defined(__ARM_NEON) || defined(_M_ARM64)
#include "../simd/arm/emulation.h"
#else
#include <emmintrin.h>
Expand Down Expand Up @@ -60,14 +60,22 @@ namespace embree

__forceinline float rcp ( const float x )
{
#if defined(__aarch64__)
#if defined(__aarch64__) || defined(_M_ARM64)
// Move scalar to vector register and do rcp.
__m128 a;
#if !defined(_M_ARM64)
a[0] = x;
#else
a.n128_f32[0] = x;
#endif
float32x4_t reciprocal = vrecpeq_f32(a);
reciprocal = vmulq_f32(vrecpsq_f32(a, reciprocal), reciprocal);
reciprocal = vmulq_f32(vrecpsq_f32(a, reciprocal), reciprocal);
#if !defined(_M_ARM64)
return reciprocal[0];
#else
return reciprocal.n128_f32[0];
#endif
#else

const __m128 a = _mm_set_ss(x);
Expand All @@ -84,58 +92,93 @@ namespace embree
return _mm_cvtss_f32(_mm_mul_ss(r,_mm_sub_ss(_mm_set_ss(2.0f), _mm_mul_ss(r, a))));
#endif

#endif //defined(__aarch64__)
#endif //defined(__aarch64__) || defined(_M_ARM64)
}

__forceinline float signmsk ( const float x ) {
#if defined(__aarch64__)
#if defined(__aarch64__) || defined(_M_ARM64)
// FP and NEON share the same vector registers on arm64
__m128 a;
__m128i b;
#if !defined(_M_ARM64)
a[0] = x;
b[0] = 0x80000000;
#else
a.n128_f32[0] = x;
b.n128_i32[0] = 0x80000000;
#endif
a = _mm_and_ps(a, vreinterpretq_f32_s32(b));
#if !defined(_M_ARM64)
return a[0];
#else
return a.n128_f32[0];
#endif
#else
return _mm_cvtss_f32(_mm_and_ps(_mm_set_ss(x),_mm_castsi128_ps(_mm_set1_epi32(0x80000000))));
#endif
}
__forceinline float xorf( const float x, const float y ) {
#if defined(__aarch64__)
#if defined(__aarch64__) || defined(_M_ARM64)
// FP and NEON share the same vector registers on arm64
__m128 a;
__m128 b;
#if !defined(_M_ARM64)
a[0] = x;
b[0] = y;
#else
a.n128_f32[0] = x;
b.n128_f32[0] = y;
#endif
a = _mm_xor_ps(a, b);
#if !defined(_M_ARM64)
return a[0];
#else
return a.n128_f32[0];
#endif
#else
return _mm_cvtss_f32(_mm_xor_ps(_mm_set_ss(x),_mm_set_ss(y)));
#endif
}
__forceinline float andf( const float x, const unsigned y ) {
#if defined(__aarch64__)
#if defined(__aarch64__) || defined(_M_ARM64)
// FP and NEON share the same vector registers on arm64
__m128 a;
__m128i b;
#if !defined(_M_ARM64)
a[0] = x;
b[0] = y;
#else
a.n128_f32[0] = x;
b.n128_u32[0] = y;
#endif
a = _mm_and_ps(a, vreinterpretq_f32_s32(b));
#if !defined(_M_ARM64)
return a[0];
#else
return a.n128_f32[0];
#endif
#else
return _mm_cvtss_f32(_mm_and_ps(_mm_set_ss(x),_mm_castsi128_ps(_mm_set1_epi32(y))));
#endif
}
__forceinline float rsqrt( const float x )
{
#if defined(__aarch64__)
#if defined(__aarch64__) || defined(_M_ARM64)
// FP and NEON share the same vector registers on arm64
__m128 a;
#if !defined(_M_ARM64)
a[0] = x;
#else
a.n128_f32[0] = x;
#endif
__m128 value = _mm_rsqrt_ps(a);
value = vmulq_f32(value, vrsqrtsq_f32(vmulq_f32(a, value), value));
value = vmulq_f32(value, vrsqrtsq_f32(vmulq_f32(a, value), value));
#if !defined(_M_ARM64)
return value[0];
#else
return value.n128_f32[0];
#endif
#else

const __m128 a = _mm_set_ss(x);
Expand Down Expand Up @@ -204,15 +247,24 @@ namespace embree
__forceinline double floor( const double x ) { return ::floor (x); }
__forceinline double ceil ( const double x ) { return ::ceil (x); }

#if defined(__aarch64__)
#if defined(__aarch64__) || defined(_M_ARM64)
__forceinline float mini(float a, float b) {
// FP and NEON share the same vector registers on arm64
__m128 x;
__m128 y;
#if !defined(_M_ARM64)
x[0] = a;
y[0] = b;
#else
x.n128_f32[0] = a;
y.n128_f32[0] = b;
#endif
x = _mm_min_ps(x, y);
return x[0];
#if !defined(_M_ARM64)
return x[0];
#else
return x.n128_f32[0];
#endif
}
#elif defined(__SSE4_1__)
__forceinline float mini(float a, float b) {
Expand All @@ -223,15 +275,24 @@ namespace embree
}
#endif

#if defined(__aarch64__)
#if defined(__aarch64__) || defined(_M_ARM64)
__forceinline float maxi(float a, float b) {
// FP and NEON share the same vector registers on arm64
__m128 x;
__m128 y;
#if !defined(_M_ARM64)
x[0] = a;
y[0] = b;
#else
x.n128_f32[0] = a;
y.n128_f32[0] = b;
#endif
x = _mm_max_ps(x, y);
return x[0];
#if !defined(_M_ARM64)
return x[0];
#else
return x.n128_f32[0];
#endif
}
#elif defined(__SSE4_1__)
__forceinline float maxi(float a, float b) {
Expand All @@ -250,7 +311,7 @@ namespace embree
__forceinline int64_t min(int64_t a, int64_t b) { return a<b ? a:b; }
__forceinline float min(float a, float b) { return a<b ? a:b; }
__forceinline double min(double a, double b) { return a<b ? a:b; }
#if defined(__64BIT__) || defined(__EMSCRIPTEN__)
#if defined(__64BIT__) || defined(__EMSCRIPTEN__) || (defined(_M_ARM64) && !defined(__clang__))
__forceinline size_t min(size_t a, size_t b) { return a<b ? a:b; }
#endif
#if defined(__EMSCRIPTEN__)
Expand All @@ -270,7 +331,7 @@ namespace embree
__forceinline int64_t max(int64_t a, int64_t b) { return a<b ? b:a; }
__forceinline float max(float a, float b) { return a<b ? b:a; }
__forceinline double max(double a, double b) { return a<b ? b:a; }
#if defined(__64BIT__) || defined(__EMSCRIPTEN__)
#if defined(__64BIT__) || defined(__EMSCRIPTEN__) || (defined(_M_ARM64) && !defined(__clang__))
__forceinline size_t max(size_t a, size_t b) { return a<b ? b:a; }
#endif
#if defined(__EMSCRIPTEN__)
Expand Down Expand Up @@ -423,7 +484,7 @@ __forceinline float nmsub ( const float a, const float b, const float c) { retur
return x | (y << 1) | (z << 2);
}

#if defined(__AVX2__) && !defined(__aarch64__)
#if defined(__AVX2__) && !defined(__aarch64__) && !defined(_M_ARM64)

template<>
__forceinline unsigned int bitInterleave(const unsigned int &xi, const unsigned int& yi, const unsigned int& zi)
Expand Down
2 changes: 1 addition & 1 deletion common/math/linearspace3.h
Original file line number Diff line number Diff line change
Expand Up @@ -96,7 +96,7 @@ namespace embree
/*! compute transposed matrix */
template<> __forceinline const LinearSpace3<Vec3fa> LinearSpace3<Vec3fa>::transposed() const {
vfloat4 rx,ry,rz; transpose((vfloat4&)vx,(vfloat4&)vy,(vfloat4&)vz,vfloat4(zero),rx,ry,rz);
return LinearSpace3<Vec3fa>(Vec3fa(rx),Vec3fa(ry),Vec3fa(rz));
return LinearSpace3<Vec3fa>(Vec3fa(rx.m128()),Vec3fa(ry.m128()),Vec3fa(rz.m128()));
}
#endif

Expand Down
4 changes: 2 additions & 2 deletions common/math/vec2.h
Original file line number Diff line number Diff line change
Expand Up @@ -205,7 +205,7 @@ namespace embree

#include "vec2fa.h"

#if defined(__SSE__) || defined(__ARM_NEON)
#if defined(__SSE__) || defined(__ARM_NEON) || defined(_M_ARM64)
#include "../simd/sse.h"
#endif

Expand All @@ -221,7 +221,7 @@ namespace embree
{
template<> __forceinline Vec2<float>::Vec2(const Vec2fa& a) : x(a.x), y(a.y) {}

#if defined(__SSE__) || defined(__ARM_NEON)
#if defined(__SSE__) || defined(__ARM_NEON) || defined(_M_ARM64)
template<> __forceinline Vec2<vfloat4>::Vec2(const Vec2fa& a) : x(a.x), y(a.y) {}
#endif

Expand Down
Loading
Loading