diff --git a/src/CodeGen_ARM.cpp b/src/CodeGen_ARM.cpp index abfff62a431a..a9a80ac0ebad 100644 --- a/src/CodeGen_ARM.cpp +++ b/src/CodeGen_ARM.cpp @@ -1,3 +1,4 @@ +#include #include #include @@ -8,6 +9,7 @@ #include "Debug.h" #include "DecomposeVectorShuffle.h" #include "DistributeShifts.h" +#include "FindIntrinsics.h" #include "IREquality.h" #include "IRMatch.h" #include "IRMutator.h" @@ -182,6 +184,7 @@ class CodeGen_ARM : public CodeGen_Posix { /** Similar to llvm_type_of, but allows providing a VectorTypeConstraint to * force Fixed or VScale vector results. */ llvm::Type *llvm_type_with_constraint(const Type &t, bool scalars_are_vectors, VectorTypeConstraint constraint); + llvm::Type *llvm_type_with_constraint(const Type &t, bool scalars_are_vectors, VectorTypeConstraint constraint, int vscale); /** Define a wrapper LLVM func that takes some arguments which Halide defines * and call inner LLVM intrinsic with an additional argument which LLVM requires. */ @@ -190,12 +193,17 @@ class CodeGen_ARM : public CodeGen_Posix { const std::string &mangled_name, const std::vector &arg_types, int intrinsic_flags, - bool sve_intrinsic); + bool sve_intrinsic, + int vscale); void init_module() override; void compile_func(const LoweredFunc &f, const std::string &simple_name, const std::string &extern_name) override; + /** Determine feasible vscale (vector_bits/128 or 0) by checking vector lanes used in the function. 
+ * Raise user_warning in case of not feasible */ + int check_feasible_vscale(int vector_bits, const std::set &lanes_used, const std::string &simple_name); + /** Nodes for which we want to emit specific ARM vector intrinsics */ // @{ void visit(const Cast *) override; @@ -210,6 +218,7 @@ class CodeGen_ARM : public CodeGen_Posix { void visit(const Call *) override; void visit(const LT *) override; void visit(const LE *) override; + Value *call_overloaded_intrin(const Type &result_type, const std::string &name, const std::vector &args) override; llvm::Type *get_vector_type_from_value(llvm::Value *vec_or_scalar, int n); Value *concat_vectors(const std::vector &) override; @@ -255,12 +264,14 @@ class CodeGen_ARM : public CodeGen_Posix { string mattrs() const override; bool use_soft_float_abi() const override; int native_vector_bits() const override; - int target_vscale() const override; + int target_vscale() const override { + return feasible_vscale; + } // NEON can be disabled for older processors. 
bool simd_intrinsics_disabled() { return target.has_feature(Target::NoNEON) && - !target.has_feature(Target::SVE2); + target_vscale() == 0; } bool is_float16_and_has_feature(const Type &t) const { @@ -282,6 +293,11 @@ class CodeGen_ARM : public CodeGen_Posix { } friend struct DecomposeVectorShuffle; + + int feasible_vscale = 0; + IntrinsicsMap intrinsics_neon; + IntrinsicsMap intrinsics_sve2; + IntrinsicsMap *effective_intrinsics; }; CodeGen_ARM::CodeGen_ARM(const Target &target) @@ -917,13 +933,18 @@ const std::map float16_transcendental_remapping = { llvm::Type *CodeGen_ARM::llvm_type_with_constraint(const Type &t, bool scalars_are_vectors, VectorTypeConstraint constraint) { + return llvm_type_with_constraint(t, scalars_are_vectors, constraint, target_vscale()); +} + +llvm::Type *CodeGen_ARM::llvm_type_with_constraint(const Type &t, bool scalars_are_vectors, + VectorTypeConstraint constraint, int vscale) { llvm::Type *ret = llvm_type_of(t.element_of()); if (!t.is_scalar() || scalars_are_vectors) { int lanes = t.lanes(); if (constraint == VectorTypeConstraint::VScale) { - lanes /= target_vscale(); + lanes /= vscale; } - ret = get_vector_type(ret, lanes, constraint); + ret = VectorType::get(ret, lanes, constraint == VectorTypeConstraint::VScale); } return ret; } @@ -933,11 +954,13 @@ llvm::Function *CodeGen_ARM::define_intrin_wrapper(const std::string &inner_name const std::string &mangled_name, const std::vector &arg_types, int intrinsic_flags, - bool sve_intrinsic) { + bool sve_intrinsic, + int vscale) { auto to_llvm_type = [&](const Type &t) { return llvm_type_with_constraint(t, (intrinsic_flags & ArmIntrinsic::ScalarsAreVectors), - !sve_intrinsic ? VectorTypeConstraint::Fixed : VectorTypeConstraint::VScale); + sve_intrinsic ? 
VectorTypeConstraint::VScale : VectorTypeConstraint::Fixed, + vscale); }; llvm::Type *llvm_ret_type = to_llvm_type(ret_type); @@ -999,6 +1022,8 @@ llvm::Function *CodeGen_ARM::define_intrin_wrapper(const std::string &inner_name if (split_arg0) { // Call the real intrinsic. + internal_assert(!sve_intrinsic) + << "Cannot handle slice_vector for sve_intrinsic as vscale has not been set at this point\n"; Value *low = slice_vector(wrapper->getArg(0), 0, inner_lanes); Value *high = slice_vector(wrapper->getArg(0), inner_lanes, inner_lanes); inner_args.push_back(low); @@ -1079,7 +1104,23 @@ void CodeGen_ARM::init_module() { // scaled, and one of two opcodes may be selected by different // iterations of this loop. for (const auto flavor : flavors) { - const bool is_sve = (flavor == SIMDFlavors::SVE); + const bool is_sve = flavor == SIMDFlavors::SVE; + + int vscale = 0; + IntrinsicsMap *intrinsics_map = nullptr; + switch (flavor) { + case SIMDFlavors::NeonWidthX1: + case SIMDFlavors::NeonWidthX2: + intrinsics_map = &intrinsics_neon; + break; + case SIMDFlavors::SVE: + vscale = target.vector_bits / 128; + intrinsics_map = &intrinsics_sve2; + break; + default: + internal_error << "unreachable\n"; + break; + } // Skip intrinsics that are NEON or SVE only depending on whether compiling for SVE. if (is_sve) { @@ -1125,7 +1166,7 @@ void CodeGen_ARM::init_module() { break; case SIMDFlavors::SVE: width_factor = (intrin.flags & ArmIntrinsic::HalfWidth) ? 2 : 1; - width_factor *= target_vscale(); + width_factor *= vscale; break; } } @@ -1165,7 +1206,7 @@ void CodeGen_ARM::init_module() { } for (const Type &t : types) { std::string llvm_vector_prefix = is_sve ? ".nxv" : ".v"; - int mangle_lanes = t.lanes() / (is_sve ? target_vscale() : 1); + int mangle_lanes = t.lanes() / (is_sve ? 
vscale : 1); mangled_name_builder << llvm_vector_prefix << mangle_lanes; if (t.is_int() || t.is_uint()) { mangled_name_builder << "i"; @@ -1179,26 +1220,111 @@ void CodeGen_ARM::init_module() { llvm::Function *intrin_impl = define_intrin_wrapper( intrin.name, ret_type, mangled_name, arg_types, - intrin.flags, is_sve); + intrin.flags, is_sve, vscale); function_does_not_access_memory(intrin_impl); intrin_impl->addFnAttr(llvm::Attribute::NoUnwind); - declare_intrin_overload(intrin.name, ret_type, intrin_impl, arg_types); + (*intrinsics_map)[intrin.name].emplace_back(ret_type, arg_types, intrin_impl); + if (intrin.flags & ArmIntrinsic::AllowUnsignedOp1) { // Also generate a version of this intrinsic where the second operand is unsigned. arg_types[1] = arg_types[1].with_code(halide_type_uint); - declare_intrin_overload(intrin.name, ret_type, intrin_impl, arg_types); + (*intrinsics_map)[intrin.name].emplace_back(ret_type, arg_types, intrin_impl); } } } } +// Traverse the IR graph and gather lanes of vector type. +// Note: we could derive this from CodeGen_C::TypeInfoGatherer +class VectorLanesGatherer : public IRGraphVisitor { +private: + using IRGraphVisitor::include; + using IRGraphVisitor::visit; + + void include_lanes(const Type &t) { + if (t.is_vector()) { + if (!t.is_handle()) { + // Vector-handle types can be seen when processing (e.g.) + // require() statements that are vectorized, but they + // will all be scalarized away prior to use, so don't emit + // them. 
+ lanes_used.insert(t.lanes()); + } + } + } + +protected: + void include(const Expr &e) override { + include_lanes(e.type()); + IRGraphVisitor::include(e); + } + + void visit(const Ramp *op) override { + include_lanes(op->type.with_lanes(op->lanes)); + IRGraphVisitor::visit(op); + } + + void visit(const Broadcast *op) override { + include_lanes(op->type.with_lanes(op->lanes)); + IRGraphVisitor::visit(op); + } + + void visit(const Call *op) override { + include_lanes(op->type); + if (op->is_intrinsic()) { + Expr lowered = lower_intrinsic(op); + if (lowered.defined()) { + lowered.accept(this); + return; + } + } + + IRGraphVisitor::visit(op); + } + +public: + std::set lanes_used; +}; + void CodeGen_ARM::compile_func(const LoweredFunc &f, const string &simple_name, const string &extern_name) { LoweredFunc func = f; + if (target.os != Target::IOS && target.os != Target::OSX) { + // Substitute in strided loads to get vld2/3/4 emission. We don't do it + // on Apple silicon, because doing a dense load and then shuffling is + // actually faster. + func.body = SubstituteInStridedLoads()(func.body); + } + // Look for opportunities to turn a + (b << c) into umlal/smlal + // and a - (b << c) into umlsl/smlsl. + func.body = distribute_shifts(func.body, /* multiply_adds */ true); + + // Inspect vector lanes used in this function to determine feasible vscale. 
+ // TODO: Target::SVE not supported https://github.com/halide/Halide/issues/8872 + feasible_vscale = 0; + if (target.features_any_of({Target::SVE2})) { + VectorLanesGatherer vector_lanes_gatherer; + func.body.accept(&vector_lanes_gatherer); + feasible_vscale = check_feasible_vscale(target.vector_bits, vector_lanes_gatherer.lanes_used, simple_name); + } + + if (feasible_vscale > 0) { + // Add attribute vscale_range + llvm::Function *llvm_func = module->getFunction(extern_name); + internal_assert(llvm_func); + llvm_func->addFnAttr(llvm::Attribute::getWithVScaleRangeArgs( + module->getContext(), feasible_vscale, feasible_vscale)); + } + + // Select intrinsics map for neon or sve2, depending on vscale + effective_intrinsics = feasible_vscale > 0 ? &intrinsics_sve2 : &intrinsics_neon; + + CodeGen_Posix::set_effective_vscale(feasible_vscale); + // Make sure run-time vscale is equal to compile-time vscale. // Avoiding the assert on inner functions is both an efficiency and a correctness issue // as the assertion code may not compile in all contexts. @@ -1213,17 +1339,38 @@ void CodeGen_ARM::compile_func(const LoweredFunc &f, } } - if (target.os != Target::IOS && target.os != Target::OSX) { - // Substitute in strided loads to get vld2/3/4 emission. We don't do it - // on Apple silicon, because doing a dense load and then shuffling is - // actually faster. - func.body = SubstituteInStridedLoads()(func.body); + CodeGen_Posix::compile_func(func, simple_name, extern_name); +} + +int CodeGen_ARM::check_feasible_vscale(int vector_bits, const std::set &lanes_used, const std::string &simple_name) { + internal_assert(vector_bits != 0 && (vector_bits % 128) == 0); + int vscale = vector_bits / 128; + bool feasible = true; + + for (const auto lanes : lanes_used) { + if (lanes <= 1) { + // Accept scalar + continue; + } + + // Check what llvm vector type could be emitted. + // 1. N must be integer based on LLVM IR spec. + // 2. 
N of odd number is excluded for now because LLVM aarch64 backend has some issue. + if ((lanes % vscale) || (lanes / vscale % 2)) { + feasible = false; + break; + } + } - // Look for opportunities to turn a + (b << c) into umlal/smlal - // and a - (b << c) into umlsl/smlsl. - func.body = distribute_shifts(func.body, /* multiply_adds */ true); - CodeGen_Posix::compile_func(func, simple_name, extern_name); + if (!feasible) { + user_warning << "In " << simple_name + << ", Vectorization factor is not suitable for scalable vectors with " + << "vector_bits=" << vector_bits + << ". Disabling SVE\n"; + return 0; + } + + return vscale; } void CodeGen_ARM::visit(const Cast *op) { @@ -1265,7 +1412,7 @@ void CodeGen_ARM::visit(const Cast *op) { // LLVM fptoui generates fcvtzs or fcvtzu in inconsistent way if (op->value.type().is_float() && op->type.is_int_or_uint() && - !target.has_feature(Target::SVE2)) { + target_vscale() == 0) { if (Value *v = call_overloaded_intrin(op->type, "fp_to_int", {op->value})) { value = v; return; } @@ -1435,7 +1582,7 @@ void CodeGen_ARM::visit(const Sub *op) { void CodeGen_ARM::visit(const Min *op) { // Use a 2-wide vector for scalar floats. if (!simd_intrinsics_disabled() && - ((op->type.is_float() && !target.has_feature(Target::SVE2)) || + ((op->type.is_float() && target_vscale() == 0) || op->type.is_vector())) { value = call_overloaded_intrin(op->type, "min", {op->a, op->b}); if (value) { @@ -1449,7 +1596,7 @@ void CodeGen_ARM::visit(const Min *op) { void CodeGen_ARM::visit(const Max *op) { // Use a 2-wide vector for scalar floats. 
if (!simd_intrinsics_disabled() && - ((op->type.is_float() && !target.has_feature(Target::SVE2)) || + ((op->type.is_float() && target_vscale() == 0) || op->type.is_vector())) { value = call_overloaded_intrin(op->type, "max", {op->a, op->b}); if (value) { @@ -1463,7 +1610,7 @@ void CodeGen_ARM::visit(const Max *op) { void CodeGen_ARM::visit(const Store *op) { // Predicated store const bool is_predicated_store = !is_const_one(op->predicate); - if (is_predicated_store && !target.has_feature(Target::SVE2)) { + if (is_predicated_store && target_vscale() == 0) { CodeGen_Posix::visit(op); return; } @@ -1477,7 +1624,7 @@ void CodeGen_ARM::visit(const Store *op) { const Ramp *ramp = op->index.as(); // We only deal with ramps here except for SVE2 - if (!ramp && !target.has_feature(Target::SVE2)) { + if (!ramp && target_vscale() == 0) { CodeGen_Posix::visit(op); return; } @@ -1503,14 +1650,12 @@ void CodeGen_ARM::visit(const Store *op) { is_float16_and_has_feature(elt) || elt == Int(8) || elt == Int(16) || elt == Int(32) || elt == Int(64) || elt == UInt(8) || elt == UInt(16) || elt == UInt(32) || elt == UInt(64)) { - const int target_vector_bits = native_vector_bits(); - if (vec_bits % 128 == 0) { + if (vec_bits % 128 == 0 || target_vscale() > 0) { type_ok_for_vst = true; - intrin_type = intrin_type.with_lanes(target_vector_bits / t.bits()); + intrin_type = intrin_type.with_lanes(native_vector_bits() / t.bits()); } else if (vec_bits % 64 == 0) { type_ok_for_vst = true; - auto intrin_bits = (vec_bits % 128 == 0 || target.has_feature(Target::SVE2)) ? 
target_vector_bits : 64; - intrin_type = intrin_type.with_lanes(intrin_bits / t.bits()); + intrin_type = intrin_type.with_lanes(64 / t.bits()); } } } @@ -1540,7 +1685,7 @@ void CodeGen_ARM::visit(const Store *op) { args[i] = codegen(shuffle->vectors[i]); } - bool is_sve = target.has_feature(Target::SVE2); + bool is_sve = (target_vscale() > 0); // Declare the function std::ostringstream instr; @@ -1636,12 +1781,12 @@ void CodeGen_ARM::visit(const Store *op) { return; } - if (target.has_feature(Target::SVE2)) { + if (target_vscale() > 0) { const IntImm *stride = ramp ? ramp->stride.as() : nullptr; if (stride && stride->value == 1) { // Basically we can deal with vanilla codegen, // but to avoid LLVM error, process with the multiple of natural_lanes - const int natural_lanes = target.natural_vector_size(op->value.type()); + const int natural_lanes = natural_vector_size(op->value.type()); if (ramp->lanes % natural_lanes && !emit_atomic_stores) { int aligned_lanes = align_up(ramp->lanes, natural_lanes); // Use predicate to prevent overrun @@ -1673,7 +1818,7 @@ void CodeGen_ARM::visit(const Store *op) { const int index_bits = 32; Type type_with_max_bits = Int(std::max(elt.bits(), index_bits)); // The number of lanes is constrained by index vector type - const int natural_lanes = target.natural_vector_size(type_with_max_bits); + const int natural_lanes = natural_vector_size(type_with_max_bits); const int vscale_natural_lanes = natural_lanes / target_vscale(); Expr base = 0; @@ -1759,7 +1904,7 @@ void CodeGen_ARM::visit(const Store *op) { void CodeGen_ARM::visit(const Load *op) { // Predicated load const bool is_predicated_load = !is_const_one(op->predicate); - if (is_predicated_load && !target.has_feature(Target::SVE2)) { + if (is_predicated_load && target_vscale() == 0) { CodeGen_Posix::visit(op); return; } @@ -1772,7 +1917,7 @@ void CodeGen_ARM::visit(const Load *op) { const Ramp *ramp = op->index.as(); // We only deal with ramps here - if (!ramp && 
!target.has_feature(Target::SVE2)) { + if (!ramp && target_vscale() == 0) { CodeGen_Posix::visit(op); return; } @@ -1780,7 +1925,7 @@ void CodeGen_ARM::visit(const Load *op) { // If the stride is in [-1, 1], we can deal with that using vanilla codegen const IntImm *stride = ramp ? ramp->stride.as() : nullptr; if (stride && (-1 <= stride->value && stride->value <= 1) && - !target.has_feature(Target::SVE2)) { + target_vscale() == 0) { CodeGen_Posix::visit(op); return; } @@ -1806,12 +1951,12 @@ void CodeGen_ARM::visit(const Load *op) { } } - if (target.has_feature(Target::SVE2)) { + if ((target_vscale() > 0)) { if (stride && stride->value < 1) { CodeGen_Posix::visit(op); return; } else if (stride && stride->value == 1) { - const int natural_lanes = target.natural_vector_size(op->type); + const int natural_lanes = natural_vector_size(op->type); if (ramp->lanes % natural_lanes) { // Load with lanes multiple of natural_lanes int aligned_lanes = align_up(ramp->lanes, natural_lanes); @@ -1849,7 +1994,7 @@ void CodeGen_ARM::visit(const Load *op) { const int index_bits = 32; Type type_with_max_bits = Int(std::max(elt.bits(), index_bits)); // The number of lanes is constrained by index vector type - const int natural_lanes = target.natural_vector_size(type_with_max_bits); + const int natural_lanes = natural_vector_size(type_with_max_bits); const int vscale_natural_lanes = natural_lanes / target_vscale(); Expr base = 0; @@ -2460,6 +2605,10 @@ void CodeGen_ARM::visit(const Call *op) { CodeGen_Posix::visit(op); } +Value *CodeGen_ARM::call_overloaded_intrin(const Type &result_type, const std::string &name, const std::vector &args) { + return CodeGen_Posix::call_overloaded_intrin(result_type, name, args, *effective_intrinsics); +} + void CodeGen_ARM::visit(const LT *op) { if (op->a.type().is_float() && op->type.is_vector()) { // Fast-math flags confuse LLVM's aarch64 backend, so @@ -2610,13 +2759,13 @@ bool CodeGen_ARM::codegen_pairwise_vector_reduce(const VectorReduce *op, const 
E narrow = lossless_cast(narrow_type.with_code(Type::UInt), op->value); } if (narrow.defined()) { - if (init.defined() && (target.bits == 32 || target.has_feature(Target::SVE2))) { + if (init.defined() && (target.bits == 32 || (target_vscale() > 0))) { // On 32-bit or SVE2, we have an intrinsic for widening add-accumulate. // TODO: this could be written as a pattern with widen_right_add (#6951). intrin = "pairwise_widening_add_accumulate"; intrin_args = {accumulator, narrow}; accumulator = Expr(); - } else if (target.has_feature(Target::SVE2)) { + } else if (target_vscale() > 0) { intrin = "pairwise_widening_add_accumulate"; intrin_args = {Expr(0), narrow}; accumulator = Expr(); @@ -2626,15 +2775,15 @@ bool CodeGen_ARM::codegen_pairwise_vector_reduce(const VectorReduce *op, const E intrin = "pairwise_widening_add"; intrin_args = {narrow}; } - } else if (!target.has_feature(Target::SVE2)) { + } else if (target_vscale() == 0) { // Exclude SVE, as it process lanes in different order (even/odd wise) than NEON intrin = "pairwise_add"; intrin_args = {op->value}; } - } else if (op->op == VectorReduce::Min && factor == 2 && !target.has_feature(Target::SVE2)) { + } else if (op->op == VectorReduce::Min && factor == 2 && target_vscale() == 0) { intrin = "pairwise_min"; intrin_args = {op->value}; - } else if (op->op == VectorReduce::Max && factor == 2 && !target.has_feature(Target::SVE2)) { + } else if (op->op == VectorReduce::Max && factor == 2 && target_vscale() == 0) { intrin = "pairwise_max"; intrin_args = {op->value}; } @@ -2685,7 +2834,7 @@ bool CodeGen_ARM::codegen_across_vector_reduce(const VectorReduce *op, const Exp Expr val = op->value; const int output_lanes = op->type.lanes(); - const int native_lanes = target.natural_vector_size(op->type); + const int native_lanes = natural_vector_size(op->type); const int input_lanes = val.type().lanes(); if (output_lanes != 1 || input_lanes < 2) { @@ -2871,16 +3020,6 @@ int CodeGen_ARM::native_vector_bits() const { return 
std::max(target_vscale(), 1) * 128; } -int CodeGen_ARM::target_vscale() const { - // TODO: https://github.com/halide/Halide/issues/8872 - // if (target.features_any_of({Target::SVE, Target::SVE2})) { - if (target.has_feature(Target::SVE2)) { - return target.vector_bits / 128; - } - - return 0; -} - bool CodeGen_ARM::supports_call_as_float16(const Call *op) const { bool is_fp16_native = float16_native_funcs.find(op->name) != float16_native_funcs.end(); bool is_fp16_transcendental = float16_transcendental_remapping.find(op->name) != float16_transcendental_remapping.end(); diff --git a/src/CodeGen_LLVM.cpp b/src/CodeGen_LLVM.cpp index 98837d27b0d4..a91b2c76953f 100644 --- a/src/CodeGen_LLVM.cpp +++ b/src/CodeGen_LLVM.cpp @@ -1395,6 +1395,10 @@ Type CodeGen_LLVM::upgrade_type_for_storage(const Type &t) const { } } +void CodeGen_LLVM::set_effective_vscale(int vscale) { + effective_vscale = vscale; +} + void CodeGen_LLVM::visit(const IntImm *op) { value = ConstantInt::getSigned(llvm_type_of(op->type), op->value); } @@ -4643,6 +4647,12 @@ void CodeGen_LLVM::declare_intrin_overload(const std::string &name, const Type & } Value *CodeGen_LLVM::call_overloaded_intrin(const Type &result_type, const std::string &name, const std::vector &args) { + return call_overloaded_intrin(result_type, name, args, intrinsics); +} + +Value *CodeGen_LLVM::call_overloaded_intrin(const Type &result_type, const std::string &name, const std::vector &args, + const IntrinsicsMap &overloaded_intrinsics) { + constexpr int debug_level = 4; debug(debug_level) << "call_overloaded_intrin: " << result_type << " " << name << "("; @@ -4653,8 +4663,8 @@ Value *CodeGen_LLVM::call_overloaded_intrin(const Type &result_type, const std:: } debug(debug_level) << ")\n"; - auto impls_i = intrinsics.find(name); - if (impls_i == intrinsics.end()) { + const auto impls_i = overloaded_intrinsics.find(name); + if (impls_i == overloaded_intrinsics.end()) { debug(debug_level) << "No intrinsic " << name << "\n"; return 
nullptr; } diff --git a/src/CodeGen_LLVM.h b/src/CodeGen_LLVM.h index 240114977f82..68c3672d90fd 100644 --- a/src/CodeGen_LLVM.h +++ b/src/CodeGen_LLVM.h @@ -164,6 +164,8 @@ class CodeGen_LLVM : public IRVisitor { * of functions as. */ virtual Type upgrade_type_for_argument_passing(const Type &) const; + void set_effective_vscale(int vscale); + std::unique_ptr module; llvm::Function *function = nullptr; llvm::LLVMContext *context = nullptr; @@ -474,8 +476,9 @@ class CodeGen_LLVM : public IRVisitor { : result_type(result_type), arg_types(std::move(arg_types)), impl(impl) { } }; + using IntrinsicsMap = std::map>; /** Mapping of intrinsic functions to the various overloads implementing it. */ - std::map> intrinsics; + IntrinsicsMap intrinsics; /** Get an LLVM intrinsic declaration. If it doesn't exist, it will be created. */ llvm::Function *get_llvm_intrin(const Type &ret_type, const std::string &name, const std::vector &arg_types, bool scalars_are_vectors = false); @@ -484,7 +487,11 @@ class CodeGen_LLVM : public IRVisitor { llvm::Function *declare_intrin_overload(const std::string &name, const Type &ret_type, const std::string &impl_name, std::vector arg_types, bool scalars_are_vectors = false); void declare_intrin_overload(const std::string &name, const Type &ret_type, llvm::Function *impl, std::vector arg_types); /** Call an overloaded intrinsic function. Returns nullptr if no suitable overload is found. */ - llvm::Value *call_overloaded_intrin(const Type &result_type, const std::string &name, const std::vector &args); + virtual llvm::Value *call_overloaded_intrin(const Type &result_type, const std::string &name, const std::vector &args); + /** Call an overloaded intrinsic function. Returns nullptr if no suitable overload is found. 
+ * Look up the given overloaded_intrinsics map for the corresponding intrin */ + llvm::Value *call_overloaded_intrin(const Type &result_type, const std::string &name, const std::vector &args, + const IntrinsicsMap &overloaded_intrinsics); /** Generate a call to a vector intrinsic or runtime inlined * function. The arguments are sliced up into vectors of the width diff --git a/test/correctness/CMakeLists.txt b/test/correctness/CMakeLists.txt index a25b077a1abb..186dda419e37 100644 --- a/test/correctness/CMakeLists.txt +++ b/test/correctness/CMakeLists.txt @@ -110,6 +110,7 @@ tests(GROUPS correctness extern_stage_on_device.cpp extract_concat_bits.cpp failed_unroll.cpp + fallback_vscale_sve.cpp fast_trigonometric.cpp fibonacci.cpp fit_function.cpp diff --git a/test/correctness/fallback_vscale_sve.cpp b/test/correctness/fallback_vscale_sve.cpp new file mode 100644 index 000000000000..e8110e910339 --- /dev/null +++ b/test/correctness/fallback_vscale_sve.cpp @@ -0,0 +1,83 @@ +#include "Halide.h" +#include +#include + +using namespace Halide; + +bool compile_and_check_vscale(Func &f, + const std::string &name, + const Target &t, + int exp_vscale, + const std::string &exp_intrin) { + + // Look into llvm-ir and check function attributes for vscale_range + auto llvm_file_name = name + ".ll"; + f.compile_to_llvm_assembly(llvm_file_name, f.infer_arguments(), t); + + Internal::assert_file_exists(llvm_file_name); + std::ifstream llvm_file; + llvm_file.open(llvm_file_name); + std::string line; + // Pattern to extract "n" and "m" in "vscale_range(n,m)" + std::regex vscale_regex(R"(vscale_range\(\s*([0-9]+)\s*,\s*([0-9]+)\s*\))"); + + int act_vscale = 0; + bool intrin_found = false; + + while (getline(llvm_file, line)) { + // Check vscale_range + std::smatch match; + if (std::regex_search(line, match, vscale_regex) && match[1] == match[2]) { + act_vscale = std::stoi(match[1]); + } + // Check intrin + if (line.find(exp_intrin) != std::string::npos) { + intrin_found = true; + } + } 
+ + if (act_vscale != exp_vscale) { + printf("[%s] Found vscale_range %d, while expected %d\n", name.c_str(), act_vscale, exp_vscale); + return false; + } + if (!intrin_found) { + printf("[%s] Cannot find expected intrin %s\n", name.c_str(), exp_intrin.c_str()); + return false; + } + return true; +} + +Var x("x"), y("y"); + +bool test_vscale(int vectorization_factor, int vector_bits, int exp_vscale) { + Func f("f"); + f(x, y) = absd(x, y); + f.compute_root().vectorize(x, vectorization_factor); + + Target t("arm-64-linux-sve2-no_asserts-no_runtime-no_bounds_query"); + t.vector_bits = vector_bits; + + std::stringstream name; + name << "test_vscale_v" << vectorization_factor << "_vector_bits_" << vector_bits; + + // sve or neon + std::string intrin = exp_vscale > 0 ? "llvm.aarch64.sve.sabd" : "llvm.aarch64.neon.sabd"; + + return compile_and_check_vscale(f, name.str(), t, exp_vscale, intrin); +} + +int main(int argc, char **argv) { + +    bool ok = true; + + ok &= test_vscale(4, 128, 1); // Regular case: with vscale=1 + ok &= test_vscale(3, 128, 0); // Fallback due to odd vectorization factor + ok &= test_vscale(8, 512, 4); // Regular case: with vscale=4 + ok &= test_vscale(4, 512, 0); // Fallback due to lanes/vscale being odd (4/4 = 1) + + if (!ok) { + return 1; + } + printf("Success!\n"); + return 0; +} diff --git a/test/warning/CMakeLists.txt b/test/warning/CMakeLists.txt index ffaa35b367c8..248c0d64261a 100644 --- a/test/warning/CMakeLists.txt +++ b/test/warning/CMakeLists.txt @@ -4,6 +4,7 @@ tests(GROUPS warning require_const_false.cpp sliding_vectors.cpp unscheduled_update_def.cpp + unsupported_vectorization_sve.cpp emulated_float16.cpp ) diff --git a/test/warning/unsupported_vectorization_sve.cpp b/test/warning/unsupported_vectorization_sve.cpp new file mode 100644 index 000000000000..3fa9c2602804 --- /dev/null +++ b/test/warning/unsupported_vectorization_sve.cpp @@ -0,0 +1,23 @@ +#include "Halide.h" +#include "halide_test_dirs.h" + +using namespace Halide; + +int main(int argc, char **argv) { + 
Func f; + Var x; + + f(x) = x * 0.1f; + + constexpr int vscale = 2; + constexpr int vector_bits = 128 * vscale; + + f.vectorize(x, vscale * 3); + Target t("arm-64-linux-sve2-vector_bits_" + std::to_string(vector_bits)); + + // SVE is disabled with a user_warning; without this fallback, codegen + // would have ended up emitting an unsupported scalable vector type. + f.compile_to_llvm_assembly(Internal::get_test_tmp_dir() + "unused.ll", f.infer_arguments(), "f", t); + + return 0; +}