diff --git a/src/CodeGen_ARM.cpp b/src/CodeGen_ARM.cpp index abfff62a431a..4de922a9f93c 100644 --- a/src/CodeGen_ARM.cpp +++ b/src/CodeGen_ARM.cpp @@ -14,6 +14,7 @@ #include "IROperator.h" #include "IRPrinter.h" #include "LLVM_Headers.h" +#include "OptimizeShuffles.h" #include "Simplify.h" #include "Substitute.h" #include "Util.h" @@ -227,6 +228,7 @@ class CodeGen_ARM : public CodeGen_Posix { Value *interleave_vectors(const std::vector &) override; Value *shuffle_vectors(Value *a, Value *b, const std::vector &indices) override; Value *shuffle_scalable_vectors_general(Value *a, Value *b, const std::vector &indices); + Value *shuffle_scalable_vectors_general_llvm(Value *a, Value *b, Value *indices, int min_index, int max_index); Value *codegen_shuffle_indices(int bits, const std::vector &indices); Value *codegen_whilelt(int total_lanes, int start, int end); void codegen_vector_reduce(const VectorReduce *, const Expr &) override; @@ -1223,6 +1225,22 @@ void CodeGen_ARM::compile_func(const LoweredFunc &f, // and a - (b << c) into umlsl/smlsl. func.body = distribute_shifts(func.body, /* multiply_adds */ true); + if (target_vscale() > 0) { + debug(1) << "ARM: Optimizing shuffles...\n"; + const int lut_alignment = 16; + + auto max_span_query = [&](const Type &lut_type) -> std::vector { + int vl = natural_vector_size(lut_type); + // SVE2 has TBL and TBL2 (TBL with two src vectors) LLVM intrinsic. + // We prioritize TBL with single src vector in favor of performance. + return {vl, vl * 2}; + }; + + func.body = optimize_shuffles(func.body, lut_alignment, native_vector_bits(), max_span_query, true); + debug(2) << "ARM: Lowering after optimizing shuffles:\n" + << func.body << "\n\n"; + } + CodeGen_Posix::compile_func(func, simple_name, extern_name); } @@ -2250,7 +2268,7 @@ Value *CodeGen_ARM::shuffle_vectors(Value *a, Value *b, const std::vector & } // Perform vector shuffle by decomposing the operation to multiple native shuffle steps - // which calls shuffle_scalable_vectors_general() which emits TBL/TBL2 instruction + // which calls shuffle_scalable_vectors_general() which emits TBL/TBL2 LLVM intrinsic. DecomposeVectorShuffle shuffler(*this, a, b, get_vector_num_elements(a->getType()), natural_lanes); return shuffler.run(indices); } @@ -2259,11 +2277,29 @@ Value *CodeGen_ARM::shuffle_scalable_vectors_general(Value *a, Value *b, const s internal_assert(a) << "Must provide a valid vector operand"; internal_assert(!indices.empty()) << "Cannot shuffle with empty indices"; + llvm::Type *elt = get_vector_element_type(a->getType()); + Value *val_indices = codegen_shuffle_indices(elt->getScalarSizeInBits(), indices); + auto [min_itr, max_itr] = std::minmax_element(indices.begin(), indices.end()); + int highest_lane = *max_itr; + internal_assert(highest_lane >= 0) + << "highest_lane was " + << (highest_lane == SliceIndexNone ? "SliceIndexNone" : + highest_lane == SliceIndexCarryPrevResult ? "SliceIndexCarryPrevResult" : + "") + << " (" << highest_lane << ")"; + + return shuffle_scalable_vectors_general_llvm(a, b, val_indices, *min_itr, *max_itr); +} + +Value *CodeGen_ARM::shuffle_scalable_vectors_general_llvm(Value *a, Value *b, Value *indices, int min_index, int max_index) { + internal_assert(a) << "Must provide a valid vector operand"; + internal_assert(indices) << "Must provide a valid indices"; + llvm::Type *elt = get_vector_element_type(a->getType()); const int bits = elt->getScalarSizeInBits(); const int natural_lanes = natural_vector_size(Int(bits)); const int src_lanes = get_vector_num_elements(a->getType()); - const int dst_lanes = indices.size(); + const int dst_lanes = get_vector_num_elements(indices->getType()); llvm::Type *dst_type = get_vector_type(elt, dst_lanes); internal_assert(target_vscale() > 0 && is_scalable_vector(a)) << "Only deal with scalable vectors\n"; @@ -2271,29 +2307,20 @@ Value *CodeGen_ARM::shuffle_scalable_vectors_general(Value *a, Value *b, const s << "Only deal with vector with natural_lanes\n"; // We select TBL or TBL2 intrinsic depending on indices range - int highest_lane = *std::max_element(indices.begin(), indices.end()); - internal_assert(highest_lane >= 0) - << "highest_lane was " - << (highest_lane == SliceIndexNone ? "SliceIndexNone" : - highest_lane == SliceIndexCarryPrevResult ? "SliceIndexCarryPrevResult" : - "") - << " (" << highest_lane << ")"; - - bool use_tbl = highest_lane < src_lanes; + const bool use_tbl = max_index < src_lanes; internal_assert(use_tbl || b) << "'b' must be valid in case of tbl2\n"; auto instr = concat_strings("llvm.aarch64.sve.", use_tbl ? "tbl" : "tbl2", mangle_llvm_type(dst_type)); - Value *val_indices = codegen_shuffle_indices(bits, indices); llvm::Type *vt_natural = get_vector_type(elt, natural_lanes); std::vector llvm_arg_types; std::vector llvm_arg_vals; if (use_tbl) { - llvm_arg_types = {vt_natural, val_indices->getType()}; - llvm_arg_vals = {a, val_indices}; + llvm_arg_types = {vt_natural, indices->getType()}; + llvm_arg_vals = {a, indices}; } else { - llvm_arg_types = {vt_natural, vt_natural, val_indices->getType()}; - llvm_arg_vals = {a, b, val_indices}; + llvm_arg_types = {vt_natural, vt_natural, indices->getType()}; + llvm_arg_vals = {a, b, indices}; } llvm::FunctionType *fn_type = FunctionType::get(vt_natural, llvm_arg_types, false); FunctionCallee fn = module->getOrInsertFunction(instr, fn_type); @@ -2383,6 +2410,41 @@ void CodeGen_ARM::visit(const Call *op) { value = codegen(lower_round_to_nearest_ties_to_even(op->args[0])); return; } + } else if (op->is_intrinsic(Call::dynamic_shuffle)) { + internal_assert(target_vscale() > 0); + internal_assert(op->args.size() == 4); + const auto min_index = as_const_int(op->args[2]); + const auto max_index = as_const_int(op->args[3]); + internal_assert(min_index.has_value() && max_index.has_value()); + + Type lut_type = op->args[0].type(); + const int src_lanes = lut_type.lanes(); + const int dst_lanes = op->args[1].type().lanes(); + const int natural_lanes = natural_vector_size(lut_type); + + debug(3) << "dynamic_shuffle: [" << *min_index << ", " << *max_index << "]" + << ", natural_lanes:" << natural_lanes << ", src_lanes:" << src_lanes << "\n"; + + Value *src = codegen(op->args[0]); + internal_assert(src_lanes <= natural_lanes * 2) << "src is too long to dynamic_shuffle\n"; + Value *src_a = slice_vector(src, 0, natural_lanes); + Value *src_b = (src_lanes > natural_lanes) ? slice_vector(src, natural_lanes, natural_lanes) : nullptr; + + // Cast index to integer with the same bits as LUT data + Type index_type = UInt(lut_type.bits()).with_lanes(dst_lanes); + Expr indices = cast(index_type, op->args[1]); + Value *val_indices = codegen(indices); + + std::vector slices; + const int num_slices = align_up(dst_lanes, natural_lanes) / natural_lanes; + slices.reserve(num_slices); + for (int i = 0; i < num_slices; i++) { + Value *indices_slice = slice_vector(val_indices, i * natural_lanes, natural_lanes); + Value *dst_slice = shuffle_scalable_vectors_general_llvm(src_a, src_b, indices_slice, *min_index, *max_index); + slices.push_back(dst_slice); + } + value = slice_vector(concat_vectors(slices), 0, dst_lanes); + return; } if (op->type.is_vector()) { diff --git a/src/CodeGen_Hexagon.cpp b/src/CodeGen_Hexagon.cpp index 25f7e11aa885..5405628e1825 100644 --- a/src/CodeGen_Hexagon.cpp +++ b/src/CodeGen_Hexagon.cpp @@ -1935,7 +1935,9 @@ void CodeGen_Hexagon::visit(const Call *op) { auto max_index = as_const_int(op->args[3]); internal_assert(min_index && max_index); Value *lut = codegen(op->args[0]); - Value *idx = codegen(op->args[1]); + // Cast the index to 8 bit + Expr index = cast(UInt(8).with_lanes(op->type.lanes()), op->args[1]); + Value *idx = codegen(index); value = vlut(lut, idx, *min_index, *max_index); return; } else if (op->is_intrinsic(Call::abs)) { diff --git a/src/HexagonOptimize.cpp b/src/HexagonOptimize.cpp index 745dd9a6e808..13a9f65dd0bf 100644 --- a/src/HexagonOptimize.cpp +++ b/src/HexagonOptimize.cpp @@ -2285,7 +2285,8 @@ class SyncronizationBarriers : public IRMutator { Stmt optimize_hexagon_shuffles(const Stmt &s, int lut_alignment) { // Replace indirect and other complicated loads with // dynamic_shuffle (vlut) calls. - return optimize_shuffles(s, lut_alignment); + auto max_span_query = [](const Type &t) -> std::vector { return {256}; }; + return optimize_shuffles(s, lut_alignment, 1024, max_span_query, false); } Stmt scatter_gather_generator(Stmt s) { diff --git a/src/OptimizeShuffles.cpp b/src/OptimizeShuffles.cpp index 83672fa59395..902563d433dd 100644 --- a/src/OptimizeShuffles.cpp +++ b/src/OptimizeShuffles.cpp @@ -21,8 +21,13 @@ namespace Internal { namespace { +using SpanQueryType = std::function(const Type &)>; + class OptimizeShuffles : public IRMutator { int lut_alignment; + int native_vector_bits; + SpanQueryType get_max_span_sizes; + bool align_loads_with_native_vector; Scope bounds; std::vector> lets; @@ -67,7 +72,7 @@ class OptimizeShuffles : public IRMutator { if (allocations_to_pad.count(op->name)) { op = s.as(); internal_assert(op); - int padding = 128 / op->type.bytes(); // One native vector + int padding = native_vector_bits / op->type.bits(); // One native vector return Allocate::make(op->name, op->type, op->memory_type, op->extents, op->condition, op->body, op->new_expr, op->free_function, @@ -99,34 +104,40 @@ class OptimizeShuffles : public IRMutator { ((unaligned_index_bounds.max + align) / align) * align - 1}; ModulusRemainder alignment(align, 0); - for (const Interval &index_bounds : {aligned_index_bounds, unaligned_index_bounds}) { - Expr index_span = span_of_bounds(index_bounds); - index_span = common_subexpression_elimination(index_span); - index_span = simplify(index_span); - - if (can_prove(index_span < 256)) { - // This is a lookup within an up to 256 element array. We - // can use dynamic_shuffle for this. - int const_extent = as_const_int(index_span) ? *as_const_int(index_span) + 1 : 256; - Expr base = simplify(index_bounds.min); - - // Load all of the possible indices loaded from the - // LUT. Note that for clamped ramps, this loads up to 1 - // vector past the max, so we will add padding to the - // allocation accordingly (if we're the one that made it). - allocations_to_pad.insert(op->name); - Expr lut = Load::make(op->type.with_lanes(const_extent), op->name, - Ramp::make(base, 1, const_extent), - op->image, op->param, const_true(const_extent), alignment); - - // We know the size of the LUT is not more than 256, so we - // can safely cast the index to 8 bit, which - // dynamic_shuffle requires. - index = simplify(cast(UInt(8).with_lanes(op->type.lanes()), index - base)); - return Call::make(op->type, "dynamic_shuffle", {lut, index, 0, const_extent - 1}, Call::PureIntrinsic); + const int native_vector_size = native_vector_bits / op->type.bits(); + + for (const auto &max_span_size : get_max_span_sizes(op->type)) { + + for (const Interval &index_bounds : {aligned_index_bounds, unaligned_index_bounds}) { + Expr index_span = span_of_bounds(index_bounds); + index_span = common_subexpression_elimination(index_span); + index_span = simplify(index_span); + + if (can_prove(index_span < max_span_size)) { + // This is a lookup within an up to max_span_size element array. We + // can use dynamic_shuffle for this. + int const_extent = as_const_int(index_span) ? *as_const_int(index_span) + 1 : max_span_size; + if (align_loads_with_native_vector) { + const_extent = align_up(const_extent, native_vector_size); + } + Expr base = simplify(index_bounds.min); + + // Load all of the possible indices loaded from the + // LUT. Note that for clamped ramps, this loads up to 1 + // vector past the max, so we will add padding to the + // allocation accordingly (if we're the one that made it). + allocations_to_pad.insert(op->name); + Expr lut = Load::make(op->type.with_lanes(const_extent), op->name, + Ramp::make(base, 1, const_extent), + op->image, op->param, const_true(const_extent), alignment); + + // Target dependent codegen needs to cast the type of index to what it accepts + index = simplify(index - base); + return Call::make(op->type, "dynamic_shuffle", {lut, index, 0, const_extent - 1}, Call::PureIntrinsic); + } + // Only the first iteration of this loop is aligned. + alignment = ModulusRemainder(); } - // Only the first iteration of this loop is aligned. - alignment = ModulusRemainder(); } } if (!index.same_as(op->index)) { @@ -137,14 +148,17 @@ class OptimizeShuffles : public IRMutator { } public: - OptimizeShuffles(int lut_alignment) - : lut_alignment(lut_alignment) { + OptimizeShuffles(int lut_alignment, int native_vector_bits, SpanQueryType get_max_span_sizes, bool align_loads_with_native_vector) + : lut_alignment(lut_alignment), + native_vector_bits(native_vector_bits), + get_max_span_sizes(std::move(get_max_span_sizes)), + align_loads_with_native_vector(align_loads_with_native_vector) { } }; } // namespace -Stmt optimize_shuffles(Stmt s, int lut_alignment) { - s = OptimizeShuffles(lut_alignment)(s); +Stmt optimize_shuffles(Stmt s, int lut_alignment, int native_vector_bits, SpanQueryType get_max_span_sizes, bool align_loads_with_native_vector) { + s = OptimizeShuffles(lut_alignment, native_vector_bits, std::move(get_max_span_sizes), align_loads_with_native_vector)(s); return s; } diff --git a/src/OptimizeShuffles.h b/src/OptimizeShuffles.h index 3d57e4c22ce1..7cb51bbf225b 100644 --- a/src/OptimizeShuffles.h +++ b/src/OptimizeShuffles.h @@ -7,13 +7,19 @@ */ #include "Expr.h" +#include +#include namespace Halide { namespace Internal { /* Replace indirect loads with dynamic_shuffle intrinsics where possible. */ -Stmt optimize_shuffles(Stmt s, int lut_alignment); +Stmt optimize_shuffles(Stmt s, + int lut_alignment, + int native_vector_bits, + std::function(const Type &)> get_max_span_sizes, + bool align_loads_with_native_vector); } // namespace Internal } // namespace Halide diff --git a/test/correctness/simd_op_check_sve2.cpp b/test/correctness/simd_op_check_sve2.cpp index 8118334666a6..f12899a8568f 100644 --- a/test/correctness/simd_op_check_sve2.cpp +++ b/test/correctness/simd_op_check_sve2.cpp @@ -855,7 +855,7 @@ class SimdOpCheckArmSve : public SimdOpCheckTest { if (instr_lanes < 2 || (total_lanes / vscale < 2)) continue; // bail out scalar and AddTestFunctor add(*this, bits, total_lanes); - Expr index = clamp(cast(in_im(x)), 0, W - 1); + Expr index = clamp(in_i32(x), 0, W - 1); Func tmp; tmp(x, y) = cast(elt, y); tmp(x, index) = cast(elt, 1); @@ -876,6 +876,38 @@ class SimdOpCheckArmSve : public SimdOpCheckTest { } } } + + // Gather load where index range is bounded within certain value. e.g. LUT + // In this case, Halide tries to transform it into contiguous load + Call::dynamic_shuffle + // which is lowered to TBL instruction. (see OptimizeShuffles.cpp) + if (has_sve()) { + const int width = base_vec_bits; + const int total_lanes = width / bits; + const int instr_lanes = Instruction::get_instr_lanes(bits, total_lanes, target); + if (instr_lanes < 2 || (total_lanes / vscale < 2)) continue; // bail out scalar and + + AddTestFunctor add(*this, bits, total_lanes); + const std::vector> index_min_max{ + {0, total_lanes - 1}, + {1, total_lanes}, + {0, total_lanes * 2 - 1}, + }; + for (auto &[index_min, index_max] : index_min_max) { + Expr index = cast(Int(32), in_im(x)); + index = clamp(index, index_min, index_max); + Expr look_up = in_im(index); + + add("tbl", look_up); + } + + // Without clamped but bounded by the range of the data type of the input image (8bit) + Expr index = cast(Int(32), in_u8(x)); // 8 bit fixed + int factor = (1 << 8) / (total_lanes * 2); + index = index / factor; // index should be within native_vector*2 range + Expr look_up = in_im(index); + + add("tbl", look_up); + } } }