diff --git a/src/CodeGen_ARM.cpp b/src/CodeGen_ARM.cpp index abfff62a431a..a9a80ac0ebad 100644 --- a/src/CodeGen_ARM.cpp +++ b/src/CodeGen_ARM.cpp @@ -1,3 +1,4 @@ +#include #include #include @@ -8,6 +9,7 @@ #include "Debug.h" #include "DecomposeVectorShuffle.h" #include "DistributeShifts.h" +#include "FindIntrinsics.h" #include "IREquality.h" #include "IRMatch.h" #include "IRMutator.h" @@ -182,6 +184,7 @@ class CodeGen_ARM : public CodeGen_Posix { /** Similar to llvm_type_of, but allows providing a VectorTypeConstraint to * force Fixed or VScale vector results. */ llvm::Type *llvm_type_with_constraint(const Type &t, bool scalars_are_vectors, VectorTypeConstraint constraint); + llvm::Type *llvm_type_with_constraint(const Type &t, bool scalars_are_vectors, VectorTypeConstraint constraint, int vscale); /** Define a wrapper LLVM func that takes some arguments which Halide defines * and call inner LLVM intrinsic with an additional argument which LLVM requires. */ @@ -190,12 +193,17 @@ class CodeGen_ARM : public CodeGen_Posix { const std::string &mangled_name, const std::vector &arg_types, int intrinsic_flags, - bool sve_intrinsic); + bool sve_intrinsic, + int vscale); void init_module() override; void compile_func(const LoweredFunc &f, const std::string &simple_name, const std::string &extern_name) override; + /** Determine feasible vscale (vector_bits/128 or 0) by checking vector lanes used in the function. 
+ * Raise user_warning in case of not feasible */ + int check_feasible_vscale(int vector_bits, const std::set &lanes_used, const std::string &simple_name); + /** Nodes for which we want to emit specific ARM vector intrinsics */ // @{ void visit(const Cast *) override; @@ -210,6 +218,7 @@ class CodeGen_ARM : public CodeGen_Posix { void visit(const Call *) override; void visit(const LT *) override; void visit(const LE *) override; + Value *call_overloaded_intrin(const Type &result_type, const std::string &name, const std::vector &args) override; llvm::Type *get_vector_type_from_value(llvm::Value *vec_or_scalar, int n); Value *concat_vectors(const std::vector &) override; @@ -255,12 +264,14 @@ class CodeGen_ARM : public CodeGen_Posix { string mattrs() const override; bool use_soft_float_abi() const override; int native_vector_bits() const override; - int target_vscale() const override; + int target_vscale() const override { + return feasible_vscale; + } // NEON can be disabled for older processors. 
bool simd_intrinsics_disabled() { return target.has_feature(Target::NoNEON) && - !target.has_feature(Target::SVE2); + target_vscale() == 0; } bool is_float16_and_has_feature(const Type &t) const { @@ -282,6 +293,11 @@ class CodeGen_ARM : public CodeGen_Posix { } friend struct DecomposeVectorShuffle; + + int feasible_vscale = 0; + IntrinsicsMap intrinsics_neon; + IntrinsicsMap intrinsics_sve2; + IntrinsicsMap *effective_intrinsics; }; CodeGen_ARM::CodeGen_ARM(const Target &target) @@ -917,13 +933,18 @@ const std::map float16_transcendental_remapping = { llvm::Type *CodeGen_ARM::llvm_type_with_constraint(const Type &t, bool scalars_are_vectors, VectorTypeConstraint constraint) { + return llvm_type_with_constraint(t, scalars_are_vectors, constraint, target_vscale()); +} + +llvm::Type *CodeGen_ARM::llvm_type_with_constraint(const Type &t, bool scalars_are_vectors, + VectorTypeConstraint constraint, int vscale) { llvm::Type *ret = llvm_type_of(t.element_of()); if (!t.is_scalar() || scalars_are_vectors) { int lanes = t.lanes(); if (constraint == VectorTypeConstraint::VScale) { - lanes /= target_vscale(); + lanes /= vscale; } - ret = get_vector_type(ret, lanes, constraint); + ret = VectorType::get(ret, lanes, constraint == VectorTypeConstraint::VScale); } return ret; } @@ -933,11 +954,13 @@ llvm::Function *CodeGen_ARM::define_intrin_wrapper(const std::string &inner_name const std::string &mangled_name, const std::vector &arg_types, int intrinsic_flags, - bool sve_intrinsic) { + bool sve_intrinsic, + int vscale) { auto to_llvm_type = [&](const Type &t) { return llvm_type_with_constraint(t, (intrinsic_flags & ArmIntrinsic::ScalarsAreVectors), - !sve_intrinsic ? VectorTypeConstraint::Fixed : VectorTypeConstraint::VScale); + sve_intrinsic ? 
VectorTypeConstraint::VScale : VectorTypeConstraint::Fixed, + vscale); }; llvm::Type *llvm_ret_type = to_llvm_type(ret_type); @@ -999,6 +1022,8 @@ llvm::Function *CodeGen_ARM::define_intrin_wrapper(const std::string &inner_name if (split_arg0) { // Call the real intrinsic. + internal_assert(!sve_intrinsic) + << "Cannot handle slice_vector for sve_intrinsic as vscale has not been set at this point\n"; Value *low = slice_vector(wrapper->getArg(0), 0, inner_lanes); Value *high = slice_vector(wrapper->getArg(0), inner_lanes, inner_lanes); inner_args.push_back(low); @@ -1079,7 +1104,23 @@ void CodeGen_ARM::init_module() { // scaled, and one of two opcodes may be selected by different // iterations of this loop. for (const auto flavor : flavors) { - const bool is_sve = (flavor == SIMDFlavors::SVE); + const bool is_sve = flavor == SIMDFlavors::SVE; + + int vscale = 0; + IntrinsicsMap *intrinsics_map = nullptr; + switch (flavor) { + case SIMDFlavors::NeonWidthX1: + case SIMDFlavors::NeonWidthX2: + intrinsics_map = &intrinsics_neon; + break; + case SIMDFlavors::SVE: + vscale = target.vector_bits / 128; + intrinsics_map = &intrinsics_sve2; + break; + default: + internal_error << "unreachable\n"; + break; + } // Skip intrinsics that are NEON or SVE only depending on whether compiling for SVE. if (is_sve) { @@ -1125,7 +1166,7 @@ void CodeGen_ARM::init_module() { break; case SIMDFlavors::SVE: width_factor = (intrin.flags & ArmIntrinsic::HalfWidth) ? 2 : 1; - width_factor *= target_vscale(); + width_factor *= vscale; break; } } @@ -1165,7 +1206,7 @@ void CodeGen_ARM::init_module() { } for (const Type &t : types) { std::string llvm_vector_prefix = is_sve ? ".nxv" : ".v"; - int mangle_lanes = t.lanes() / (is_sve ? target_vscale() : 1); + int mangle_lanes = t.lanes() / (is_sve ? 
vscale : 1); mangled_name_builder << llvm_vector_prefix << mangle_lanes; if (t.is_int() || t.is_uint()) { mangled_name_builder << "i"; @@ -1179,26 +1220,111 @@ void CodeGen_ARM::init_module() { llvm::Function *intrin_impl = define_intrin_wrapper( intrin.name, ret_type, mangled_name, arg_types, - intrin.flags, is_sve); + intrin.flags, is_sve, vscale); function_does_not_access_memory(intrin_impl); intrin_impl->addFnAttr(llvm::Attribute::NoUnwind); - declare_intrin_overload(intrin.name, ret_type, intrin_impl, arg_types); + (*intrinsics_map)[intrin.name].emplace_back(ret_type, arg_types, intrin_impl); + if (intrin.flags & ArmIntrinsic::AllowUnsignedOp1) { // Also generate a version of this intrinsic where the second operand is unsigned. arg_types[1] = arg_types[1].with_code(halide_type_uint); - declare_intrin_overload(intrin.name, ret_type, intrin_impl, arg_types); + (*intrinsics_map)[intrin.name].emplace_back(ret_type, arg_types, intrin_impl); } } } } +// Traverse the IR graph and gather lanes of vector type. +// Note: we could derive this from CodeGen_C::TypeInfoGatherer +class VectorLanesGatherer : public IRGraphVisitor { +private: + using IRGraphVisitor::include; + using IRGraphVisitor::visit; + + void include_lanes(const Type &t) { + if (t.is_vector()) { + if (!t.is_handle()) { + // Vector-handle types can be seen when processing (e.g.) + // require() statements that are vectorized, but they + // will all be scalarized away prior to use, so don't emit + // them. 
+ lanes_used.insert(t.lanes()); + } + } + } + +protected: + void include(const Expr &e) override { + include_lanes(e.type()); + IRGraphVisitor::include(e); + } + + void visit(const Ramp *op) override { + include_lanes(op->type.with_lanes(op->lanes)); + IRGraphVisitor::visit(op); + } + + void visit(const Broadcast *op) override { + include_lanes(op->type.with_lanes(op->lanes)); + IRGraphVisitor::visit(op); + } + + void visit(const Call *op) override { + include_lanes(op->type); + if (op->is_intrinsic()) { + Expr lowered = lower_intrinsic(op); + if (lowered.defined()) { + lowered.accept(this); + return; + } + } + + IRGraphVisitor::visit(op); + } + +public: + std::set lanes_used; +}; + void CodeGen_ARM::compile_func(const LoweredFunc &f, const string &simple_name, const string &extern_name) { LoweredFunc func = f; + if (target.os != Target::IOS && target.os != Target::OSX) { + // Substitute in strided loads to get vld2/3/4 emission. We don't do it + // on Apple silicon, because doing a dense load and then shuffling is + // actually faster. + func.body = SubstituteInStridedLoads()(func.body); + } + // Look for opportunities to turn a + (b << c) into umlal/smlal + // and a - (b << c) into umlsl/smlsl. + func.body = distribute_shifts(func.body, /* multiply_adds */ true); + + // Inspect vector lanes used in this function to determine feasible vscale. 
+ // TODO: Target::SVE not supported https://github.com/halide/Halide/issues/8872 + feasible_vscale = 0; + if (target.features_any_of({Target::SVE2})) { + VectorLanesGatherer vector_lanes_gatherer; + func.body.accept(&vector_lanes_gatherer); + feasible_vscale = check_feasible_vscale(target.vector_bits, vector_lanes_gatherer.lanes_used, simple_name); + } + + if (feasible_vscale > 0) { + // Add attribute vscale_range + llvm::Function *llvm_func = module->getFunction(extern_name); + internal_assert(llvm_func); + llvm_func->addFnAttr(llvm::Attribute::getWithVScaleRangeArgs( + module->getContext(), feasible_vscale, feasible_vscale)); + } + + // Select intrinsics map for neon or sve2, depending on vscale + effective_intrinsics = feasible_vscale > 0 ? &intrinsics_sve2 : &intrinsics_neon; + + CodeGen_Posix::set_effective_vscale(feasible_vscale); + // Make sure run-time vscale is equal to compile-time vscale. // Avoiding the assert on inner functions is both an efficiency and a correctness issue // as the assertion code may not compile in all contexts. @@ -1213,17 +1339,38 @@ void CodeGen_ARM::compile_func(const LoweredFunc &f, } } - if (target.os != Target::IOS && target.os != Target::OSX) { - // Substitute in strided loads to get vld2/3/4 emission. We don't do it - // on Apple silicon, because doing a dense load and then shuffling is - // actually faster. - func.body = SubstituteInStridedLoads()(func.body); + CodeGen_Posix::compile_func(func, simple_name, extern_name); +} + +int CodeGen_ARM::check_feasible_vscale(int vector_bits, const std::set &lanes_used, const std::string &simple_name) { + internal_assert(vector_bits != 0 && (vector_bits % 128) == 0); + int vscale = vector_bits / 128; + bool feasible = true; + + for (const auto lanes : lanes_used) { + if (lanes <= 1) { + // Accept scalar + continue; + } + + // Check what llvm vector type could be emitted. + // 1. N must be integer based on LLVM IR spec. + // 2. 
N of odd number is excluded for now because LLVM aarch64 backend has some issue. + if ((lanes % vscale) || (lanes / vscale % 2)) { + feasible = false; + break; + } + } - // Look for opportunities to turn a + (b << c) into umlal/smlal - // and a - (b << c) into umlsl/smlsl. - func.body = distribute_shifts(func.body, /* multiply_adds */ true); - CodeGen_Posix::compile_func(func, simple_name, extern_name); + if (!feasible) { + user_warning << "In " << simple_name + << ", Vectorization factor is not suitable for scalable vectors with " + << "vector_bits=" << vector_bits + << ". Disabling SVE\n"; + return 0; + } + + return vscale; } void CodeGen_ARM::visit(const Cast *op) { @@ -1265,7 +1412,7 @@ void CodeGen_ARM::visit(const Cast *op) { // LLVM fptoui generates fcvtzs or fcvtzu in inconsistent way if (op->value.type().is_float() && op->type.is_int_or_uint() && - !target.has_feature(Target::SVE2)) { + target_vscale() == 0) { if (Value *v = call_overloaded_intrin(op->type, "fp_to_int", {op->value})) { value = v; return; } @@ -1435,7 +1582,7 @@ void CodeGen_ARM::visit(const Sub *op) { void CodeGen_ARM::visit(const Min *op) { // Use a 2-wide vector for scalar floats. if (!simd_intrinsics_disabled() && - ((op->type.is_float() && !target.has_feature(Target::SVE2)) || + ((op->type.is_float() && target_vscale() == 0) || op->type.is_vector())) { value = call_overloaded_intrin(op->type, "min", {op->a, op->b}); if (value) { @@ -1449,7 +1596,7 @@ void CodeGen_ARM::visit(const Min *op) { void CodeGen_ARM::visit(const Max *op) { // Use a 2-wide vector for scalar floats. 
if (!simd_intrinsics_disabled() && - ((op->type.is_float() && !target.has_feature(Target::SVE2)) || + ((op->type.is_float() && target_vscale() == 0) || op->type.is_vector())) { value = call_overloaded_intrin(op->type, "max", {op->a, op->b}); if (value) { @@ -1463,7 +1610,7 @@ void CodeGen_ARM::visit(const Max *op) { void CodeGen_ARM::visit(const Store *op) { // Predicated store const bool is_predicated_store = !is_const_one(op->predicate); - if (is_predicated_store && !target.has_feature(Target::SVE2)) { + if (is_predicated_store && target_vscale() == 0) { CodeGen_Posix::visit(op); return; } @@ -1477,7 +1624,7 @@ void CodeGen_ARM::visit(const Store *op) { const Ramp *ramp = op->index.as(); // We only deal with ramps here except for SVE2 - if (!ramp && !target.has_feature(Target::SVE2)) { + if (!ramp && target_vscale() == 0) { CodeGen_Posix::visit(op); return; } @@ -1503,14 +1650,12 @@ void CodeGen_ARM::visit(const Store *op) { is_float16_and_has_feature(elt) || elt == Int(8) || elt == Int(16) || elt == Int(32) || elt == Int(64) || elt == UInt(8) || elt == UInt(16) || elt == UInt(32) || elt == UInt(64)) { - const int target_vector_bits = native_vector_bits(); - if (vec_bits % 128 == 0) { + if (vec_bits % 128 == 0 || target_vscale() > 0) { type_ok_for_vst = true; - intrin_type = intrin_type.with_lanes(target_vector_bits / t.bits()); + intrin_type = intrin_type.with_lanes(native_vector_bits() / t.bits()); } else if (vec_bits % 64 == 0) { type_ok_for_vst = true; - auto intrin_bits = (vec_bits % 128 == 0 || target.has_feature(Target::SVE2)) ? 
target_vector_bits : 64; - intrin_type = intrin_type.with_lanes(intrin_bits / t.bits()); + intrin_type = intrin_type.with_lanes(64 / t.bits()); } } } @@ -1540,7 +1685,7 @@ void CodeGen_ARM::visit(const Store *op) { args[i] = codegen(shuffle->vectors[i]); } - bool is_sve = target.has_feature(Target::SVE2); + bool is_sve = (target_vscale() > 0); // Declare the function std::ostringstream instr; @@ -1636,12 +1781,12 @@ void CodeGen_ARM::visit(const Store *op) { return; } - if (target.has_feature(Target::SVE2)) { + if (target_vscale() > 0) { const IntImm *stride = ramp ? ramp->stride.as() : nullptr; if (stride && stride->value == 1) { // Basically we can deal with vanilla codegen, // but to avoid LLVM error, process with the multiple of natural_lanes - const int natural_lanes = target.natural_vector_size(op->value.type()); + const int natural_lanes = natural_vector_size(op->value.type()); if (ramp->lanes % natural_lanes && !emit_atomic_stores) { int aligned_lanes = align_up(ramp->lanes, natural_lanes); // Use predicate to prevent overrun @@ -1673,7 +1818,7 @@ void CodeGen_ARM::visit(const Store *op) { const int index_bits = 32; Type type_with_max_bits = Int(std::max(elt.bits(), index_bits)); // The number of lanes is constrained by index vector type - const int natural_lanes = target.natural_vector_size(type_with_max_bits); + const int natural_lanes = natural_vector_size(type_with_max_bits); const int vscale_natural_lanes = natural_lanes / target_vscale(); Expr base = 0; @@ -1759,7 +1904,7 @@ void CodeGen_ARM::visit(const Store *op) { void CodeGen_ARM::visit(const Load *op) { // Predicated load const bool is_predicated_load = !is_const_one(op->predicate); - if (is_predicated_load && !target.has_feature(Target::SVE2)) { + if (is_predicated_load && target_vscale() == 0) { CodeGen_Posix::visit(op); return; } @@ -1772,7 +1917,7 @@ void CodeGen_ARM::visit(const Load *op) { const Ramp *ramp = op->index.as(); // We only deal with ramps here - if (!ramp && 
!target.has_feature(Target::SVE2)) { + if (!ramp && target_vscale() == 0) { CodeGen_Posix::visit(op); return; } @@ -1780,7 +1925,7 @@ void CodeGen_ARM::visit(const Load *op) { // If the stride is in [-1, 1], we can deal with that using vanilla codegen const IntImm *stride = ramp ? ramp->stride.as() : nullptr; if (stride && (-1 <= stride->value && stride->value <= 1) && - !target.has_feature(Target::SVE2)) { + target_vscale() == 0) { CodeGen_Posix::visit(op); return; } @@ -1806,12 +1951,12 @@ void CodeGen_ARM::visit(const Load *op) { } } - if (target.has_feature(Target::SVE2)) { + if ((target_vscale() > 0)) { if (stride && stride->value < 1) { CodeGen_Posix::visit(op); return; } else if (stride && stride->value == 1) { - const int natural_lanes = target.natural_vector_size(op->type); + const int natural_lanes = natural_vector_size(op->type); if (ramp->lanes % natural_lanes) { // Load with lanes multiple of natural_lanes int aligned_lanes = align_up(ramp->lanes, natural_lanes); @@ -1849,7 +1994,7 @@ void CodeGen_ARM::visit(const Load *op) { const int index_bits = 32; Type type_with_max_bits = Int(std::max(elt.bits(), index_bits)); // The number of lanes is constrained by index vector type - const int natural_lanes = target.natural_vector_size(type_with_max_bits); + const int natural_lanes = natural_vector_size(type_with_max_bits); const int vscale_natural_lanes = natural_lanes / target_vscale(); Expr base = 0; @@ -2460,6 +2605,10 @@ void CodeGen_ARM::visit(const Call *op) { CodeGen_Posix::visit(op); } +Value *CodeGen_ARM::call_overloaded_intrin(const Type &result_type, const std::string &name, const std::vector &args) { + return CodeGen_Posix::call_overloaded_intrin(result_type, name, args, *effective_intrinsics); +} + void CodeGen_ARM::visit(const LT *op) { if (op->a.type().is_float() && op->type.is_vector()) { // Fast-math flags confuse LLVM's aarch64 backend, so @@ -2610,13 +2759,13 @@ bool CodeGen_ARM::codegen_pairwise_vector_reduce(const VectorReduce *op, const 
E narrow = lossless_cast(narrow_type.with_code(Type::UInt), op->value); } if (narrow.defined()) { - if (init.defined() && (target.bits == 32 || target.has_feature(Target::SVE2))) { + if (init.defined() && (target.bits == 32 || (target_vscale() > 0))) { // On 32-bit or SVE2, we have an intrinsic for widening add-accumulate. // TODO: this could be written as a pattern with widen_right_add (#6951). intrin = "pairwise_widening_add_accumulate"; intrin_args = {accumulator, narrow}; accumulator = Expr(); - } else if (target.has_feature(Target::SVE2)) { + } else if (target_vscale() > 0) { intrin = "pairwise_widening_add_accumulate"; intrin_args = {Expr(0), narrow}; accumulator = Expr(); @@ -2626,15 +2775,15 @@ bool CodeGen_ARM::codegen_pairwise_vector_reduce(const VectorReduce *op, const E intrin = "pairwise_widening_add"; intrin_args = {narrow}; } - } else if (!target.has_feature(Target::SVE2)) { + } else if (target_vscale() == 0) { // Exclude SVE, as it process lanes in different order (even/odd wise) than NEON intrin = "pairwise_add"; intrin_args = {op->value}; } - } else if (op->op == VectorReduce::Min && factor == 2 && !target.has_feature(Target::SVE2)) { + } else if (op->op == VectorReduce::Min && factor == 2 && target_vscale() == 0) { intrin = "pairwise_min"; intrin_args = {op->value}; - } else if (op->op == VectorReduce::Max && factor == 2 && !target.has_feature(Target::SVE2)) { + } else if (op->op == VectorReduce::Max && factor == 2 && target_vscale() == 0) { intrin = "pairwise_max"; intrin_args = {op->value}; } @@ -2685,7 +2834,7 @@ bool CodeGen_ARM::codegen_across_vector_reduce(const VectorReduce *op, const Exp Expr val = op->value; const int output_lanes = op->type.lanes(); - const int native_lanes = target.natural_vector_size(op->type); + const int native_lanes = natural_vector_size(op->type); const int input_lanes = val.type().lanes(); if (output_lanes != 1 || input_lanes < 2) { @@ -2871,16 +3020,6 @@ int CodeGen_ARM::native_vector_bits() const { return 
std::max(target_vscale(), 1) * 128; } -int CodeGen_ARM::target_vscale() const { - // TODO: https://github.com/halide/Halide/issues/8872 - // if (target.features_any_of({Target::SVE, Target::SVE2})) { - if (target.has_feature(Target::SVE2)) { - return target.vector_bits / 128; - } - - return 0; -} - bool CodeGen_ARM::supports_call_as_float16(const Call *op) const { bool is_fp16_native = float16_native_funcs.find(op->name) != float16_native_funcs.end(); bool is_fp16_transcendental = float16_transcendental_remapping.find(op->name) != float16_transcendental_remapping.end(); diff --git a/src/CodeGen_LLVM.cpp b/src/CodeGen_LLVM.cpp index 98837d27b0d4..a91b2c76953f 100644 --- a/src/CodeGen_LLVM.cpp +++ b/src/CodeGen_LLVM.cpp @@ -1395,6 +1395,10 @@ Type CodeGen_LLVM::upgrade_type_for_storage(const Type &t) const { } } +void CodeGen_LLVM::set_effective_vscale(int vscale) { + effective_vscale = vscale; +} + void CodeGen_LLVM::visit(const IntImm *op) { value = ConstantInt::getSigned(llvm_type_of(op->type), op->value); } @@ -4643,6 +4647,12 @@ void CodeGen_LLVM::declare_intrin_overload(const std::string &name, const Type & } Value *CodeGen_LLVM::call_overloaded_intrin(const Type &result_type, const std::string &name, const std::vector &args) { + return call_overloaded_intrin(result_type, name, args, intrinsics); +} + +Value *CodeGen_LLVM::call_overloaded_intrin(const Type &result_type, const std::string &name, const std::vector &args, + const IntrinsicsMap &overloaded_intrinsics) { + constexpr int debug_level = 4; debug(debug_level) << "call_overloaded_intrin: " << result_type << " " << name << "("; @@ -4653,8 +4663,8 @@ Value *CodeGen_LLVM::call_overloaded_intrin(const Type &result_type, const std:: } debug(debug_level) << ")\n"; - auto impls_i = intrinsics.find(name); - if (impls_i == intrinsics.end()) { + const auto impls_i = overloaded_intrinsics.find(name); + if (impls_i == overloaded_intrinsics.end()) { debug(debug_level) << "No intrinsic " << name << "\n"; return 
nullptr; } diff --git a/src/CodeGen_LLVM.h b/src/CodeGen_LLVM.h index 240114977f82..68c3672d90fd 100644 --- a/src/CodeGen_LLVM.h +++ b/src/CodeGen_LLVM.h @@ -164,6 +164,8 @@ class CodeGen_LLVM : public IRVisitor { * of functions as. */ virtual Type upgrade_type_for_argument_passing(const Type &) const; + void set_effective_vscale(int vscale); + std::unique_ptr module; llvm::Function *function = nullptr; llvm::LLVMContext *context = nullptr; @@ -474,8 +476,9 @@ class CodeGen_LLVM : public IRVisitor { : result_type(result_type), arg_types(std::move(arg_types)), impl(impl) { } }; + using IntrinsicsMap = std::map>; /** Mapping of intrinsic functions to the various overloads implementing it. */ - std::map> intrinsics; + IntrinsicsMap intrinsics; /** Get an LLVM intrinsic declaration. If it doesn't exist, it will be created. */ llvm::Function *get_llvm_intrin(const Type &ret_type, const std::string &name, const std::vector &arg_types, bool scalars_are_vectors = false); @@ -484,7 +487,11 @@ class CodeGen_LLVM : public IRVisitor { llvm::Function *declare_intrin_overload(const std::string &name, const Type &ret_type, const std::string &impl_name, std::vector arg_types, bool scalars_are_vectors = false); void declare_intrin_overload(const std::string &name, const Type &ret_type, llvm::Function *impl, std::vector arg_types); /** Call an overloaded intrinsic function. Returns nullptr if no suitable overload is found. */ - llvm::Value *call_overloaded_intrin(const Type &result_type, const std::string &name, const std::vector &args); + virtual llvm::Value *call_overloaded_intrin(const Type &result_type, const std::string &name, const std::vector &args); + /** Call an overloaded intrinsic function. Returns nullptr if no suitable overload is found. 
+ * Look up the given overloaded_intrinsics map for the corresponding intrin */ + llvm::Value *call_overloaded_intrin(const Type &result_type, const std::string &name, const std::vector &args, + const IntrinsicsMap &overloaded_intrinsics); /** Generate a call to a vector intrinsic or runtime inlined * function. The arguments are sliced up into vectors of the width diff --git a/test/correctness/CMakeLists.txt b/test/correctness/CMakeLists.txt index a25b077a1abb..186dda419e37 100644 --- a/test/correctness/CMakeLists.txt +++ b/test/correctness/CMakeLists.txt @@ -110,6 +110,7 @@ tests(GROUPS correctness extern_stage_on_device.cpp extract_concat_bits.cpp failed_unroll.cpp + fallback_vscale_sve.cpp fast_trigonometric.cpp fibonacci.cpp fit_function.cpp diff --git a/test/correctness/fallback_vscale_sve.cpp b/test/correctness/fallback_vscale_sve.cpp new file mode 100644 index 000000000000..e8110e910339 --- /dev/null +++ b/test/correctness/fallback_vscale_sve.cpp @@ -0,0 +1,83 @@ +#include "Halide.h" +#include +#include + +using namespace Halide; + +bool compile_and_check_vscale(Func &f, + const std::string &name, + const Target &t, + int exp_vscale, + const std::string &exp_intrin) { + + // Look into llvm-ir and check function attributes for vscale_range + auto llvm_file_name = name + ".ll"; + f.compile_to_llvm_assembly(llvm_file_name, f.infer_arguments(), t); + + Internal::assert_file_exists(llvm_file_name); + std::ifstream llvm_file; + llvm_file.open(llvm_file_name); + std::string line; + // Pattern to extract "n" and "m" in "vscale_range(n,m)" + std::regex vscale_regex(R"(vscale_range\(\s*([0-9]+)\s*,\s*([0-9]+)\s*\))"); + + int act_vscale = 0; + bool intrin_found = false; + + while (getline(llvm_file, line)) { + // Check vscale_range + std::smatch match; + if (std::regex_search(line, match, vscale_regex) && match[1] == match[2]) { + act_vscale = std::stoi(match[1]); + } + // Check intrin + if (line.find(exp_intrin) != std::string::npos) { + intrin_found = true; + } + } 
+ + if (act_vscale != exp_vscale) { + printf("[%s] Found vscale_range %d, while expected %d\n", name.c_str(), act_vscale, exp_vscale); + return false; + } + if (!intrin_found) { + printf("[%s] Cannot find expected intrin %s\n", name.c_str(), exp_intrin.c_str()); + return false; + } + return true; +} + +Var x("x"), y("y"); + +bool test_vscale(int vectorization_factor, int vector_bits, int exp_vscale) { + Func f("f"); + f(x, y) = absd(x, y); + f.compute_root().vectorize(x, vectorization_factor); + + Target t("arm-64-linux-sve2-no_asserts-no_runtime-no_bounds_query"); + t.vector_bits = vector_bits; + + std::stringstream name; + name << "test_vscale_v" << vectorization_factor << "_vector_bits_" << vector_bits; + + // sve or neon + std::string intrin = exp_vscale > 0 ? "llvm.aarch64.sve.sabd" : "llvm.aarch64.neon.sabd"; + + return compile_and_check_vscale(f, name.str(), t, exp_vscale, intrin); +} + +int main(int argc, char **argv) { + +    bool ok = true; + + ok &= test_vscale(4, 128, 1); // Regular case: with vscale=1 + ok &= test_vscale(3, 128, 0); // Fallback due to odd vectorization factor + ok &= test_vscale(8, 512, 4); // Regular case: with vscale=4 + ok &= test_vscale(4, 512, 0); // Fallback due to lanes/vscale being odd (4/4 = 1) + + if (!ok) { + return 1; + } + printf("Success!\n"); + return 0; +} diff --git a/test/warning/CMakeLists.txt b/test/warning/CMakeLists.txt index ffaa35b367c8..248c0d64261a 100644 --- a/test/warning/CMakeLists.txt +++ b/test/warning/CMakeLists.txt @@ -4,6 +4,7 @@ tests(GROUPS warning require_const_false.cpp sliding_vectors.cpp unscheduled_update_def.cpp + unsupported_vectorization_sve.cpp emulated_float16.cpp ) diff --git a/test/warning/unsupported_vectorization_sve.cpp b/test/warning/unsupported_vectorization_sve.cpp new file mode 100644 index 000000000000..3fa9c2602804 --- /dev/null +++ b/test/warning/unsupported_vectorization_sve.cpp @@ -0,0 +1,23 @@ +#include "Halide.h" +#include "halide_test_dirs.h" + +using namespace Halide; + +int main(int argc, char **argv) { + 
Func f; + Var x; + + f(x) = x * 0.1f; + + constexpr int vscale = 2; + constexpr int vector_bits = 128 * vscale; + + f.vectorize(x, vscale * 3); + Target t("arm-64-linux-sve2-vector_bits_" + std::to_string(vector_bits)); + + // SVE is disabled with a user_warning; without this fallback, codegen + // would have ended up emitting an unsupported scalable vector type. + f.compile_to_llvm_assembly(Internal::get_test_tmp_dir() + "unused.ll", f.infer_arguments(), "f", t); + + return 0; +}