diff --git a/src/CodeGen_ARM.cpp b/src/CodeGen_ARM.cpp
index abfff62a431a..4de922a9f93c 100644
--- a/src/CodeGen_ARM.cpp
+++ b/src/CodeGen_ARM.cpp
@@ -14,6 +14,7 @@
 #include "IROperator.h"
 #include "IRPrinter.h"
 #include "LLVM_Headers.h"
+#include "OptimizeShuffles.h"
 #include "Simplify.h"
 #include "Substitute.h"
 #include "Util.h"
@@ -227,6 +228,7 @@ class CodeGen_ARM : public CodeGen_Posix {
     Value *interleave_vectors(const std::vector<Value *> &) override;
     Value *shuffle_vectors(Value *a, Value *b, const std::vector<int> &indices) override;
     Value *shuffle_scalable_vectors_general(Value *a, Value *b, const std::vector<int> &indices);
+    Value *shuffle_scalable_vectors_general_llvm(Value *a, Value *b, Value *indices, int min_index, int max_index);
     Value *codegen_shuffle_indices(int bits, const std::vector<int> &indices);
     Value *codegen_whilelt(int total_lanes, int start, int end);
     void codegen_vector_reduce(const VectorReduce *, const Expr &) override;
@@ -1223,6 +1225,22 @@ void CodeGen_ARM::compile_func(const LoweredFunc &f,
     // and a - (b << c) into umlsl/smlsl.
     func.body = distribute_shifts(func.body, /* multiply_adds */ true);
 
+    if (target_vscale() > 0) {
+        debug(1) << "ARM: Optimizing shuffles...\n";
+        const int lut_alignment = 16;
+        auto max_span_query = [&](const Type &lut_type) -> std::vector<int> {
+            int vl = natural_vector_size(lut_type);
+            // SVE2 offers both the TBL and TBL2 (TBL with two source vectors) LLVM intrinsics.
+            // The single-vector span is listed first so that TBL is preferred, for performance.
+            // NOTE(review): the dynamic_shuffle lowering casts indices to UInt(lut_type.bits()); confirm vl * 2 - 1 always fits (e.g. 8-bit LUTs on 2048-bit vectors).
+            return {vl, vl * 2};
+        };
+
+        func.body = optimize_shuffles(func.body, lut_alignment, native_vector_bits(), max_span_query, /* align_loads_with_native_vector */ true);
+        debug(2) << "ARM: Lowering after optimizing shuffles:\n"
+                 << func.body << "\n\n";
+    }
+
     CodeGen_Posix::compile_func(func, simple_name, extern_name);
 }
 
@@ -2250,7 +2268,7 @@ Value *CodeGen_ARM::shuffle_vectors(Value *a, Value *b, const std::vector<int> &
     }
 
     // Perform vector shuffle by decomposing the operation to multiple native shuffle steps
-    // which calls shuffle_scalable_vectors_general() which emits TBL/TBL2 instruction
+    // which call shuffle_scalable_vectors_general() to emit the TBL/TBL2 LLVM intrinsics.
     DecomposeVectorShuffle shuffler(*this, a, b, get_vector_num_elements(a->getType()), natural_lanes);
     return shuffler.run(indices);
 }
@@ -2259,11 +2277,29 @@ Value *CodeGen_ARM::shuffle_scalable_vectors_general(Value *a, Value *b, const s
     internal_assert(a) << "Must provide a valid vector operand";
     internal_assert(!indices.empty()) << "Cannot shuffle with empty indices";
 
+    // Validate the lane range before emitting the index vector, so that bad input hits this assert first.
+    auto [min_itr, max_itr] = std::minmax_element(indices.begin(), indices.end());
+    const int highest_lane = *max_itr;
+    internal_assert(highest_lane >= 0)
+        << "highest_lane was "
+        << (highest_lane == SliceIndexNone            ? "SliceIndexNone" :
+            highest_lane == SliceIndexCarryPrevResult ? "SliceIndexCarryPrevResult" :
+                                                        "")
+        << " (" << highest_lane << ")";
+    llvm::Type *elt = get_vector_element_type(a->getType());
+    Value *val_indices = codegen_shuffle_indices(elt->getScalarSizeInBits(), indices);
+    return shuffle_scalable_vectors_general_llvm(a, b, val_indices, *min_itr, highest_lane);
+}
+
+Value *CodeGen_ARM::shuffle_scalable_vectors_general_llvm(Value *a, Value *b, Value *indices, int min_index, int max_index) {
+    internal_assert(a) << "Must provide a valid vector operand";
+    internal_assert(indices) << "Must provide a valid indices";
+
     llvm::Type *elt = get_vector_element_type(a->getType());
     const int bits = elt->getScalarSizeInBits();
     const int natural_lanes = natural_vector_size(Int(bits));
     const int src_lanes = get_vector_num_elements(a->getType());
-    const int dst_lanes = indices.size();
+    const int dst_lanes = get_vector_num_elements(indices->getType());
     llvm::Type *dst_type = get_vector_type(elt, dst_lanes);
 
     internal_assert(target_vscale() > 0 && is_scalable_vector(a)) << "Only deal with scalable vectors\n";
@@ -2271,29 +2307,20 @@ Value *CodeGen_ARM::shuffle_scalable_vectors_general(Value *a, Value *b, const s
         << "Only deal with vector with natural_lanes\n";
 
     // We select TBL or TBL2 intrinsic depending on indices range
-    int highest_lane = *std::max_element(indices.begin(), indices.end());
-    internal_assert(highest_lane >= 0)
-        << "highest_lane was "
-        << (highest_lane == SliceIndexNone            ? "SliceIndexNone" :
-            highest_lane == SliceIndexCarryPrevResult ? "SliceIndexCarryPrevResult" :
-                                                        "")
-        << " (" << highest_lane << ")";
-
-    bool use_tbl = highest_lane < src_lanes;
+    const bool use_tbl = max_index < src_lanes;
     internal_assert(use_tbl || b) << "'b' must be valid in case of tbl2\n";
 
     auto instr = concat_strings("llvm.aarch64.sve.", use_tbl ? "tbl" : "tbl2", mangle_llvm_type(dst_type));
 
-    Value *val_indices = codegen_shuffle_indices(bits, indices);
     llvm::Type *vt_natural = get_vector_type(elt, natural_lanes);
     std::vector<llvm::Type *> llvm_arg_types;
     std::vector<llvm::Value *> llvm_arg_vals;
     if (use_tbl) {
-        llvm_arg_types = {vt_natural, val_indices->getType()};
-        llvm_arg_vals = {a, val_indices};
+        llvm_arg_types = {vt_natural, indices->getType()};
+        llvm_arg_vals = {a, indices};
     } else {
-        llvm_arg_types = {vt_natural, vt_natural, val_indices->getType()};
-        llvm_arg_vals = {a, b, val_indices};
+        llvm_arg_types = {vt_natural, vt_natural, indices->getType()};
+        llvm_arg_vals = {a, b, indices};
     }
     llvm::FunctionType *fn_type = FunctionType::get(vt_natural, llvm_arg_types, false);
     FunctionCallee fn = module->getOrInsertFunction(instr, fn_type);
@@ -2383,6 +2410,41 @@ void CodeGen_ARM::visit(const Call *op) {
             value = codegen(lower_round_to_nearest_ties_to_even(op->args[0]));
             return;
         }
+    } else if (op->is_intrinsic(Call::dynamic_shuffle)) {
+        // Lower dynamic_shuffle(lut, indices, min_index, max_index) to SVE TBL/TBL2 intrinsics.
+        internal_assert(target_vscale() > 0);
+        internal_assert(op->args.size() == 4);
+        const auto min_index = as_const_int(op->args[2]);
+        const auto max_index = as_const_int(op->args[3]);
+        internal_assert(min_index.has_value() && max_index.has_value());
+
+        Type lut_type = op->args[0].type();
+        const int src_lanes = lut_type.lanes();
+        const int dst_lanes = op->args[1].type().lanes();
+        const int natural_lanes = natural_vector_size(lut_type);
+        debug(3) << "dynamic_shuffle: [" << *min_index << ", " << *max_index << "]"
+                 << ", natural_lanes:" << natural_lanes << ", src_lanes:" << src_lanes << "\n";
+
+        Value *src = codegen(op->args[0]);
+        internal_assert(src_lanes <= natural_lanes * 2) << "src is too long to dynamic_shuffle\n";
+        Value *src_a = slice_vector(src, 0, natural_lanes);
+        Value *src_b = (src_lanes > natural_lanes) ? slice_vector(src, natural_lanes, natural_lanes) : nullptr;  // null when the LUT fits in one natural vector
+
+        // Cast index to integer with the same bits as LUT data
+        Type index_type = UInt(lut_type.bits()).with_lanes(dst_lanes);
+        Expr indices = cast(index_type, op->args[1]);
+        Value *val_indices = codegen(indices);
+        // Shuffle one natural vector of indices at a time; the final slice_vector drops the lanes added by rounding up.
+        std::vector<Value *> slices;
+        const int num_slices = align_up(dst_lanes, natural_lanes) / natural_lanes;
+        slices.reserve(num_slices);
+        for (int i = 0; i < num_slices; i++) {
+            Value *indices_slice = slice_vector(val_indices, i * natural_lanes, natural_lanes);
+            Value *dst_slice = shuffle_scalable_vectors_general_llvm(src_a, src_b, indices_slice, *min_index, *max_index);
+            slices.push_back(dst_slice);
+        }
+        value = slice_vector(concat_vectors(slices), 0, dst_lanes);
+        return;
     }
 
     if (op->type.is_vector()) {
diff --git a/src/CodeGen_Hexagon.cpp b/src/CodeGen_Hexagon.cpp
index 25f7e11aa885..5405628e1825 100644
--- a/src/CodeGen_Hexagon.cpp
+++ b/src/CodeGen_Hexagon.cpp
@@ -1935,7 +1935,9 @@ void CodeGen_Hexagon::visit(const Call *op) {
             auto max_index = as_const_int(op->args[3]);
             internal_assert(min_index && max_index);
             Value *lut = codegen(op->args[0]);
-            Value *idx = codegen(op->args[1]);
+            // optimize_shuffles leaves the index type as-is, so narrow it to 8 bits here before handing it to vlut.
+            Expr index = cast(UInt(8).with_lanes(op->type.lanes()), op->args[1]);
+            Value *idx = codegen(index);
             value = vlut(lut, idx, *min_index, *max_index);
             return;
         } else if (op->is_intrinsic(Call::abs)) {
diff --git a/src/HexagonOptimize.cpp b/src/HexagonOptimize.cpp
index 745dd9a6e808..13a9f65dd0bf 100644
--- a/src/HexagonOptimize.cpp
+++ b/src/HexagonOptimize.cpp
@@ -2285,7 +2285,8 @@ class SyncronizationBarriers : public IRMutator {
 Stmt optimize_hexagon_shuffles(const Stmt &s, int lut_alignment) {
     // Replace indirect and other complicated loads with
     // dynamic_shuffle (vlut) calls.
-    return optimize_shuffles(s, lut_alignment);
+    auto max_span_query = [](const Type &t) -> std::vector<int> { return {256}; };  // a vlut LUT spans at most 256 elements
+    return optimize_shuffles(s, lut_alignment, /* native_vector_bits */ 1024, max_span_query, /* align_loads_with_native_vector */ false);
 }
 
 Stmt scatter_gather_generator(Stmt s) {
diff --git a/src/OptimizeShuffles.cpp b/src/OptimizeShuffles.cpp
index 83672fa59395..902563d433dd 100644
--- a/src/OptimizeShuffles.cpp
+++ b/src/OptimizeShuffles.cpp
@@ -21,8 +21,13 @@ namespace Internal {
 
 namespace {
 
+using SpanQueryType = std::function<std::vector<int>(const Type &)>;
+
 class OptimizeShuffles : public IRMutator {
     int lut_alignment;
+    int native_vector_bits;
+    SpanQueryType get_max_span_sizes;
+    bool align_loads_with_native_vector;
     Scope<Interval> bounds;
     std::vector<std::pair<std::string, Expr>> lets;
 
@@ -67,7 +72,7 @@ class OptimizeShuffles : public IRMutator {
         if (allocations_to_pad.count(op->name)) {
             op = s.as<Allocate>();
             internal_assert(op);
-            int padding = 128 / op->type.bytes();  // One native vector
+            int padding = native_vector_bits / op->type.bits();  // One native vector
             return Allocate::make(op->name, op->type, op->memory_type,
                                   op->extents, op->condition,
                                   op->body, op->new_expr, op->free_function,
@@ -99,34 +104,40 @@ class OptimizeShuffles : public IRMutator {
                 ((unaligned_index_bounds.max + align) / align) * align - 1};
             ModulusRemainder alignment(align, 0);
 
-            for (const Interval &index_bounds : {aligned_index_bounds, unaligned_index_bounds}) {
-                Expr index_span = span_of_bounds(index_bounds);
-                index_span = common_subexpression_elimination(index_span);
-                index_span = simplify(index_span);
-
-                if (can_prove(index_span < 256)) {
-                    // This is a lookup within an up to 256 element array. We
-                    // can use dynamic_shuffle for this.
-                    int const_extent = as_const_int(index_span) ? *as_const_int(index_span) + 1 : 256;
-                    Expr base = simplify(index_bounds.min);
-
-                    // Load all of the possible indices loaded from the
-                    // LUT. Note that for clamped ramps, this loads up to 1
-                    // vector past the max, so we will add padding to the
-                    // allocation accordingly (if we're the one that made it).
-                    allocations_to_pad.insert(op->name);
-                    Expr lut = Load::make(op->type.with_lanes(const_extent), op->name,
-                                          Ramp::make(base, 1, const_extent),
-                                          op->image, op->param, const_true(const_extent), alignment);
-
-                    // We know the size of the LUT is not more than 256, so we
-                    // can safely cast the index to 8 bit, which
-                    // dynamic_shuffle requires.
-                    index = simplify(cast(UInt(8).with_lanes(op->type.lanes()), index - base));
-                    return Call::make(op->type, "dynamic_shuffle", {lut, index, 0, const_extent - 1}, Call::PureIntrinsic);
+            const int native_vector_size = native_vector_bits / op->type.bits();
+            for (const auto &max_span_size : get_max_span_sizes(op->type)) {
+                // Candidates are tried in the order given; each one starts over with the aligned bounds, so restore their alignment.
+                alignment = ModulusRemainder(align, 0);
+                for (const Interval &index_bounds : {aligned_index_bounds, unaligned_index_bounds}) {
+                    Expr index_span = span_of_bounds(index_bounds);
+                    index_span = common_subexpression_elimination(index_span);
+                    index_span = simplify(index_span);
+
+                    if (can_prove(index_span < max_span_size)) {
+                        // This is a lookup within an up to max_span_size element array. We
+                        // can use dynamic_shuffle for this.
+                        int const_extent = as_const_int(index_span) ? *as_const_int(index_span) + 1 : max_span_size;
+                        if (align_loads_with_native_vector) {
+                            const_extent = align_up(const_extent, native_vector_size);
+                        }
+                        Expr base = simplify(index_bounds.min);
+
+                        // Load all of the possible indices loaded from the
+                        // LUT. Note that for clamped ramps, this loads up to 1
+                        // vector past the max, so we will add padding to the
+                        // allocation accordingly (if we're the one that made it).
+                        allocations_to_pad.insert(op->name);
+                        Expr lut = Load::make(op->type.with_lanes(const_extent), op->name,
+                                              Ramp::make(base, 1, const_extent),
+                                              op->image, op->param, const_true(const_extent), alignment);
+
+                        // Target dependent codegen needs to cast the type of index to what it accepts
+                        index = simplify(index - base);
+                        return Call::make(op->type, "dynamic_shuffle", {lut, index, 0, const_extent - 1}, Call::PureIntrinsic);
+                    }
+                    // Only the first iteration of this loop is aligned.
+                    alignment = ModulusRemainder();
+                }
-                // Only the first iteration of this loop is aligned.
-                alignment = ModulusRemainder();
             }
         }
         if (!index.same_as(op->index)) {
@@ -137,14 +148,17 @@ class OptimizeShuffles : public IRMutator {
     }
 
 public:
-    OptimizeShuffles(int lut_alignment)
-        : lut_alignment(lut_alignment) {
+    OptimizeShuffles(int lut_alignment, int native_vector_bits, SpanQueryType get_max_span_sizes, bool align_loads_with_native_vector)
+        : lut_alignment(lut_alignment),
+          native_vector_bits(native_vector_bits),
+          get_max_span_sizes(std::move(get_max_span_sizes)),
+          align_loads_with_native_vector(align_loads_with_native_vector) {
     }
 };
 }  // namespace
 
-Stmt optimize_shuffles(Stmt s, int lut_alignment) {
-    s = OptimizeShuffles(lut_alignment)(s);
+Stmt optimize_shuffles(Stmt s, int lut_alignment, int native_vector_bits, SpanQueryType get_max_span_sizes, bool align_loads_with_native_vector) {
+    s = OptimizeShuffles(lut_alignment, native_vector_bits, std::move(get_max_span_sizes), align_loads_with_native_vector)(s);
     return s;
 }
 
diff --git a/src/OptimizeShuffles.h b/src/OptimizeShuffles.h
index 3d57e4c22ce1..7cb51bbf225b 100644
--- a/src/OptimizeShuffles.h
+++ b/src/OptimizeShuffles.h
@@ -7,13 +7,19 @@
  */
 
 #include "Expr.h"
+#include <functional>
+#include <vector>
 
 namespace Halide {
 namespace Internal {
 
 /* Replace indirect loads with dynamic_shuffle intrinsics where
 possible. */
-Stmt optimize_shuffles(Stmt s, int lut_alignment);
+Stmt optimize_shuffles(Stmt s,
+                       int lut_alignment,
+                       int native_vector_bits,                                            // width of one native vector, in bits
+                       std::function<std::vector<int>(const Type &)> get_max_span_sizes,  // candidate LUT spans for a load type, tried in order
+                       bool align_loads_with_native_vector);                              // round the LUT load up to whole native vectors
 
 }  // namespace Internal
 }  // namespace Halide
diff --git a/test/correctness/simd_op_check_sve2.cpp b/test/correctness/simd_op_check_sve2.cpp
index 8118334666a6..f12899a8568f 100644
--- a/test/correctness/simd_op_check_sve2.cpp
+++ b/test/correctness/simd_op_check_sve2.cpp
@@ -855,7 +855,7 @@ class SimdOpCheckArmSve : public SimdOpCheckTest {
                     if (instr_lanes < 2 || (total_lanes / vscale < 2)) continue;  // bail out scalar and <vscale x 1 x ty>
 
                     AddTestFunctor add(*this, bits, total_lanes);
-                    Expr index = clamp(cast<int>(in_im(x)), 0, W - 1);
+                    Expr index = clamp(in_i32(x), 0, W - 1);
                     Func tmp;
                     tmp(x, y) = cast(elt, y);
                     tmp(x, index) = cast(elt, 1);
@@ -876,6 +876,38 @@ class SimdOpCheckArmSve : public SimdOpCheckTest {
                     }
                 }
             }
+
+            // Gather loads whose index range is bounded by a known value, e.g. a LUT.
+            // In this case, Halide tries to transform the gather into a contiguous load plus
+            // Call::dynamic_shuffle, which is lowered to a TBL instruction (see OptimizeShuffles.cpp).
+            if (has_sve()) {
+                const int width = base_vec_bits;
+                const int total_lanes = width / bits;
+                const int instr_lanes = Instruction::get_instr_lanes(bits, total_lanes, target);
+                if (instr_lanes < 2 || (total_lanes / vscale < 2)) continue;  // bail out scalar and <vscale x 1 x ty>
+
+                AddTestFunctor add(*this, bits, total_lanes);
+                const std::vector<std::pair<int, int>> index_min_max{
+                    {0, total_lanes - 1},      // covers total_lanes entries starting at 0
+                    {1, total_lanes},          // same number of entries, but starting at 1
+                    {0, total_lanes * 2 - 1},  // covers total_lanes * 2 entries
+                };
+                for (auto &[index_min, index_max] : index_min_max) {
+                    Expr index = cast(Int(32), in_im(x));
+                    index = clamp(index, index_min, index_max);
+                    Expr look_up = in_im(index);
+
+                    add("tbl", look_up);
+                }
+
+                // Not clamped, but bounded by the range of the 8-bit data type of the input image.
+                Expr index = cast(Int(32), in_u8(x));  // fixed at 8 bits
+                int factor = (1 << 8) / (total_lanes * 2);
+                index = index / factor;  // index should be within the native_vector * 2 range
+                Expr look_up = in_im(index);
+
+                add("tbl", look_up);
+            }
         }
     }