diff --git a/libvmaf/src/feature/feature_extractor.c b/libvmaf/src/feature/feature_extractor.c index ef17b901f..49a41892c 100644 --- a/libvmaf/src/feature/feature_extractor.c +++ b/libvmaf/src/feature/feature_extractor.c @@ -46,6 +46,7 @@ extern VmafFeatureExtractor vmaf_fex_psnr; extern VmafFeatureExtractor vmaf_fex_psnr_hvs; extern VmafFeatureExtractor vmaf_fex_integer_adm; extern VmafFeatureExtractor vmaf_fex_integer_motion; +extern VmafFeatureExtractor vmaf_fex_integer_motion_v2; extern VmafFeatureExtractor vmaf_fex_integer_vif; extern VmafFeatureExtractor vmaf_fex_cambi; #if HAVE_CUDA @@ -71,6 +72,7 @@ static VmafFeatureExtractor *feature_extractor_list[] = { &vmaf_fex_psnr_hvs, &vmaf_fex_integer_adm, &vmaf_fex_integer_motion, + &vmaf_fex_integer_motion_v2, &vmaf_fex_integer_vif, &vmaf_fex_cambi, #if HAVE_CUDA diff --git a/libvmaf/src/feature/feature_extractor.h b/libvmaf/src/feature/feature_extractor.h index 574436e76..6d8cb7f11 100644 --- a/libvmaf/src/feature/feature_extractor.h +++ b/libvmaf/src/feature/feature_extractor.h @@ -38,6 +38,7 @@ enum VmafFeatureExtractorFlags { VMAF_FEATURE_EXTRACTOR_TEMPORAL = 1 << 0, VMAF_FEATURE_EXTRACTOR_CUDA = 1 << 1, VMAF_FEATURE_FRAME_SYNC = 1 << 2, + VMAF_FEATURE_EXTRACTOR_PREV_REF = 1 << 3, }; typedef struct VmafFeatureExtractor { @@ -97,6 +98,7 @@ typedef struct VmafFeatureExtractor { #endif VmafFrameSyncContext *framesync; + VmafPicture prev_ref; ///< Previous reference picture, set by framework. } VmafFeatureExtractor; diff --git a/libvmaf/src/feature/integer_motion_v2.c b/libvmaf/src/feature/integer_motion_v2.c new file mode 100644 index 000000000..85c3b1772 --- /dev/null +++ b/libvmaf/src/feature/integer_motion_v2.c @@ -0,0 +1,291 @@ +/** + * + * Copyright 2016-2025 Netflix, Inc. + * + * Licensed under the BSD+Patent License (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * https://opensource.org/licenses/BSDplusPatent + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + */ + +// Pipelined motion feature extractor. +// +// Computes the same motion score as integer_motion.c but without storing +// blurred frames across extract calls. Instead, it exploits the linearity +// of convolution: SAD(blur[N-1], blur[N]) == sum(|blur(f[N-1] - f[N])|). +// +// The frame difference, blur, and absolute-sum are fused into a single +// row-at-a-time pipeline, requiring only one row of scratch memory. +// +// The framework provides the previous reference frame via fex->prev_ref, +// making each extract call stateless with respect to pixel data. 
+ +#include +#include +#include + +#include "cpu.h" +#include "feature_collector.h" +#include "feature_extractor.h" +#include "integer_motion.h" + +#if ARCH_X86 +#include "x86/motion_v2_avx2.h" +#if HAVE_AVX512 +#include "x86/motion_v2_avx512.h" +#endif +#endif + +typedef uint64_t (*motion_pipeline_fn)(const uint8_t *, ptrdiff_t, + const uint8_t *, ptrdiff_t, + int32_t *, unsigned, unsigned, + unsigned bpc); + +typedef struct MotionV2State { + int32_t *y_row; + unsigned w, h, bpc; + motion_pipeline_fn pipeline; +} MotionV2State; + +static inline int mirror(int idx, int size) +{ + if (idx < 0) return -idx; + if (idx >= size) return 2 * size - idx - 1; + return idx; +} + +static uint64_t +motion_score_pipeline_8(const uint8_t *prev, ptrdiff_t prev_stride, + const uint8_t *cur, ptrdiff_t cur_stride, + int32_t *y_row, unsigned w, unsigned h, + unsigned bpc) +{ + (void)bpc; + const int radius = filter_width / 2; + const int32_t y_round = 1 << 7; + const int32_t x_round = 1 << 15; + + uint64_t sad = 0; + + for (unsigned i = 0; i < h; i++) { + // Fused diff + y_conv for row i (shift by 8, matching v1 precision) + int32_t any_nonzero = 0; + for (unsigned j = 0; j < w; j++) { + int32_t accum = 0; + for (int k = 0; k < filter_width; k++) { + const int row = mirror((int)i - radius + k, (int)h); + int32_t diff = prev[row * prev_stride + j] + - cur[row * cur_stride + j]; + accum += (int32_t)filter[k] * diff; + } + y_row[j] = (accum + y_round) >> 8; + any_nonzero |= y_row[j]; + } + + if (!any_nonzero) continue; + + // x_conv + abs + accumulate for row i + uint32_t row_sad = 0; + for (unsigned j = 0; j < w; j++) { + int64_t accum = 0; + for (int k = 0; k < filter_width; k++) { + const int col = mirror((int)j - radius + k, (int)w); + accum += (int64_t)filter[k] * y_row[col]; + } + int32_t val = (int32_t)((accum + x_round) >> 16); + row_sad += abs(val); + } + sad += row_sad; + } + + return sad; +} + +static inline uint64_t +motion_score_pipeline_16(const uint8_t *prev_u8, ptrdiff_t 
prev_stride, + const uint8_t *cur_u8, ptrdiff_t cur_stride, + int32_t *y_row, unsigned w, unsigned h, + unsigned bpc) +{ + const uint16_t *prev = (const uint16_t *)prev_u8; + const uint16_t *cur = (const uint16_t *)cur_u8; + const ptrdiff_t p_stride = prev_stride / 2; + const ptrdiff_t c_stride = cur_stride / 2; + + const int radius = filter_width / 2; + const int32_t y_round = 1 << (bpc - 1); + const int32_t x_round = 1 << 15; + + uint64_t sad = 0; + + for (unsigned i = 0; i < h; i++) { + // Fused diff + y_conv for row i + int32_t any_nonzero = 0; + for (unsigned j = 0; j < w; j++) { + int64_t accum = 0; + for (int k = 0; k < filter_width; k++) { + const int row = mirror((int)i - radius + k, (int)h); + int32_t diff = prev[row * p_stride + j] + - cur[row * c_stride + j]; + accum += (int64_t)filter[k] * diff; + } + y_row[j] = (int32_t)((accum + y_round) >> bpc); + any_nonzero |= y_row[j]; + } + + if (!any_nonzero) continue; + + // x_conv + abs + accumulate for row i + uint32_t row_sad = 0; + for (unsigned j = 0; j < w; j++) { + int64_t accum = 0; + for (int k = 0; k < filter_width; k++) { + const int col = mirror((int)j - radius + k, (int)w); + accum += (int64_t)filter[k] * y_row[col]; + } + int32_t val = (int32_t)((accum + x_round) >> 16); + row_sad += abs(val); + } + sad += row_sad; + } + + return sad; +} + +static int init(VmafFeatureExtractor *fex, enum VmafPixelFormat pix_fmt, + unsigned bpc, unsigned w, unsigned h) +{ + (void) pix_fmt; + MotionV2State *s = fex->priv; + + s->w = w; + s->h = h; + s->bpc = bpc; + + s->y_row = malloc(sizeof(*s->y_row) * w); + if (!s->y_row) return -ENOMEM; + + if (bpc == 8) + s->pipeline = motion_score_pipeline_8; + else + s->pipeline = motion_score_pipeline_16; + +#if ARCH_X86 + if (vmaf_get_cpu_flags() & VMAF_X86_CPU_FLAG_AVX2) { + if (bpc == 8) + s->pipeline = motion_score_pipeline_8_avx2; + else + s->pipeline = motion_score_pipeline_16_avx2; + } +#if HAVE_AVX512 + if (vmaf_get_cpu_flags() & VMAF_X86_CPU_FLAG_AVX512) { + if 
(bpc == 8) + s->pipeline = motion_score_pipeline_8_avx512; + else + s->pipeline = motion_score_pipeline_16_avx512; + } +#endif +#endif + + return 0; +} + +static int extract(VmafFeatureExtractor *fex, + VmafPicture *ref_pic, VmafPicture *ref_pic_90, + VmafPicture *dist_pic, VmafPicture *dist_pic_90, + unsigned index, VmafFeatureCollector *feature_collector) +{ + MotionV2State *s = fex->priv; + + (void) dist_pic; + (void) ref_pic_90; + (void) dist_pic_90; + + if (index == 0) { + return vmaf_feature_collector_append(feature_collector, + "VMAF_integer_feature_motion_v2_sad_score", 0., index); + } + + if (!fex->prev_ref.ref) + return -EINVAL; + + const unsigned w = s->w; + const unsigned h = s->h; + const uint8_t *prev_data = (const uint8_t *)fex->prev_ref.data[0]; + const uint8_t *cur_data = (const uint8_t *)ref_pic->data[0]; + + uint64_t sad = s->pipeline(prev_data, fex->prev_ref.stride[0], + cur_data, ref_pic->stride[0], + s->y_row, w, h, s->bpc); + + double score = (double)sad / 256. / (w * h); + + return vmaf_feature_collector_append(feature_collector, + "VMAF_integer_feature_motion_v2_sad_score", score, index); +} + +static int close_fex(VmafFeatureExtractor *fex) +{ + MotionV2State *s = fex->priv; + free(s->y_row); + return 0; +} + +static int flush(VmafFeatureExtractor *fex, + VmafFeatureCollector *feature_collector) +{ + (void) fex; + + unsigned n_frames = 0; + double dummy; + while (!vmaf_feature_collector_get_score(feature_collector, + "VMAF_integer_feature_motion_v2_sad_score", &dummy, n_frames)) + n_frames++; + + if (n_frames < 2) return 1; + + for (unsigned i = 0; i < n_frames; i++) { + double score_cur, score_next; + vmaf_feature_collector_get_score(feature_collector, + "VMAF_integer_feature_motion_v2_sad_score", &score_cur, i); + + double motion2; + if (i + 1 < n_frames) { + vmaf_feature_collector_get_score(feature_collector, + "VMAF_integer_feature_motion_v2_sad_score", &score_next, i + 1); + motion2 = score_cur < score_next ? 
score_cur : score_next; + } else { + motion2 = score_cur; + } + + vmaf_feature_collector_append(feature_collector, + "VMAF_integer_feature_motion2_v2_score", motion2, i); + } + + return 1; +} + +static const char *provided_features[] = { + "VMAF_integer_feature_motion_v2_sad_score", + "VMAF_integer_feature_motion2_v2_score", + NULL +}; + +VmafFeatureExtractor vmaf_fex_integer_motion_v2 = { + .name = "motion_v2", + .init = init, + .extract = extract, + .flush = flush, + .close = close_fex, + .priv_size = sizeof(MotionV2State), + .provided_features = provided_features, + .flags = VMAF_FEATURE_EXTRACTOR_PREV_REF, +}; diff --git a/libvmaf/src/feature/x86/motion_v2_avx2.c b/libvmaf/src/feature/x86/motion_v2_avx2.c new file mode 100644 index 000000000..77b5e5c17 --- /dev/null +++ b/libvmaf/src/feature/x86/motion_v2_avx2.c @@ -0,0 +1,335 @@ +/** + * + * Copyright 2016-2025 Netflix, Inc. + * + * Licensed under the BSD+Patent License (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * https://opensource.org/licenses/BSDplusPatent + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + */ + +#include +#include +#include +#include + +#include "feature/integer_motion.h" + +static inline int mirror(int idx, int size) +{ + if (idx < 0) return -idx; + if (idx >= size) return 2 * size - idx - 1; + return idx; +} + +// Emulate arithmetic right shift of int64 by 16 in AVX2. +// AVX2 lacks srai_epi64; this uses the blend trick: +// low dwords come from logical shift, high dwords from arithmetic shift. 
+static inline __m256i srai_epi64_16(__m256i v) +{ + __m256i lo = _mm256_srli_epi64(v, 16); + __m256i hi = _mm256_srai_epi32(v, 16); + return _mm256_blend_epi32(lo, hi, 0xAA); +} + +// SIMD phase 2: x_conv + abs + SAD for one row of int32 y_row. +// Processes 8 int32 columns at a time via mullo_epi32 + int64 accumulation. +static inline uint32_t +x_conv_row_sad_avx2(const int32_t *y_row, unsigned w) +{ + const __m256i g0 = _mm256_set1_epi32(3571); + const __m256i g1 = _mm256_set1_epi32(16004); + const __m256i g2 = _mm256_set1_epi32(26386); + const __m256i round64 = _mm256_set1_epi64x(1 << 15); + const __m256i perm_idx = _mm256_setr_epi32(0, 2, 4, 6, 0, 0, 0, 0); + + uint32_t row_sad = 0; + + // Scalar left edge (columns 0, 1) — mirror boundary + unsigned j; + for (j = 0; j < 2 && j < w; j++) { + int64_t accum = 0; + for (int k = 0; k < 5; k++) { + int col = mirror((int)j - 2 + k, (int)w); + accum += (int64_t)filter[k] * y_row[col]; + } + int32_t val = (int32_t)((accum + (1 << 15)) >> 16); + row_sad += abs(val); + } + + // SIMD middle: need y_row[j-2]..y_row[j+9], so j+10 <= w + __m256i sad_acc = _mm256_setzero_si256(); + for (; j + 10 <= w; j += 8) { + __m256i y0 = _mm256_loadu_si256((__m256i*)(y_row + j - 2)); + __m256i y1 = _mm256_loadu_si256((__m256i*)(y_row + j - 1)); + __m256i y2 = _mm256_loadu_si256((__m256i*)(y_row + j)); + __m256i y3 = _mm256_loadu_si256((__m256i*)(y_row + j + 1)); + __m256i y4 = _mm256_loadu_si256((__m256i*)(y_row + j + 2)); + + // Each product fits in int32 + __m256i p0 = _mm256_mullo_epi32(y0, g0); + __m256i p1 = _mm256_mullo_epi32(y1, g1); + __m256i p2 = _mm256_mullo_epi32(y2, g2); + __m256i p3 = _mm256_mullo_epi32(y3, g1); + __m256i p4 = _mm256_mullo_epi32(y4, g0); + + // Safe pairs that fit in int32 + __m256i s04 = _mm256_add_epi32(p0, p4); + __m256i s13 = _mm256_add_epi32(p1, p3); + + // Widen to int64 and accumulate (lo 4 elements) + __m256i acc_lo = _mm256_cvtepi32_epi64(_mm256_castsi256_si128(s04)); + acc_lo = 
_mm256_add_epi64(acc_lo, _mm256_cvtepi32_epi64(_mm256_castsi256_si128(s13))); + acc_lo = _mm256_add_epi64(acc_lo, _mm256_cvtepi32_epi64(_mm256_castsi256_si128(p2))); + + // hi 4 elements + __m256i acc_hi = _mm256_cvtepi32_epi64(_mm256_extracti128_si256(s04, 1)); + acc_hi = _mm256_add_epi64(acc_hi, _mm256_cvtepi32_epi64(_mm256_extracti128_si256(s13, 1))); + acc_hi = _mm256_add_epi64(acc_hi, _mm256_cvtepi32_epi64(_mm256_extracti128_si256(p2, 1))); + + // Round and arithmetic right shift >>16 + acc_lo = srai_epi64_16(_mm256_add_epi64(acc_lo, round64)); + acc_hi = srai_epi64_16(_mm256_add_epi64(acc_hi, round64)); + + // Pack int64 -> int32 (gather low 32 bits of each 64-bit lane) + __m128i res_lo = _mm256_castsi256_si128( + _mm256_permutevar8x32_epi32(acc_lo, perm_idx)); + __m128i res_hi = _mm256_castsi256_si128( + _mm256_permutevar8x32_epi32(acc_hi, perm_idx)); + + // Combine into 8 x int32, abs, accumulate + __m256i result = _mm256_inserti128_si256( + _mm256_castsi128_si256(res_lo), res_hi, 1); + __m256i abs_result = _mm256_abs_epi32(result); + + sad_acc = _mm256_add_epi32(sad_acc, abs_result); + } + + // Horizontal reduction of sad_acc (8 x int32 -> scalar) + __m128i lo128 = _mm256_castsi256_si128(sad_acc); + __m128i hi128 = _mm256_extracti128_si256(sad_acc, 1); + __m128i sum128 = _mm_add_epi32(lo128, hi128); + sum128 = _mm_add_epi32(sum128, + _mm_shuffle_epi32(sum128, _MM_SHUFFLE(1, 0, 3, 2))); + sum128 = _mm_add_epi32(sum128, + _mm_shuffle_epi32(sum128, _MM_SHUFFLE(0, 1, 0, 1))); + row_sad += (uint32_t)_mm_cvtsi128_si32(sum128); + + // Scalar right edge + tail + for (; j < w; j++) { + int64_t accum = 0; + for (int k = 0; k < 5; k++) { + int col = mirror((int)j - 2 + k, (int)w); + accum += (int64_t)filter[k] * y_row[col]; + } + int32_t val = (int32_t)((accum + (1 << 15)) >> 16); + row_sad += abs(val); + } + + return row_sad; +} + +uint64_t motion_score_pipeline_16_avx2(const uint8_t *prev_u8, ptrdiff_t prev_stride, + const uint8_t *cur_u8, ptrdiff_t cur_stride, + 
int32_t *y_row, unsigned w, unsigned h, + unsigned bpc) +{ + const uint16_t *prev = (const uint16_t *)prev_u8; + const uint16_t *cur = (const uint16_t *)cur_u8; + const ptrdiff_t p_stride = prev_stride / 2; + const ptrdiff_t c_stride = cur_stride / 2; + + const __m256i g0 = _mm256_set1_epi32(3571); + const __m256i g1 = _mm256_set1_epi32(16004); + const __m256i g2 = _mm256_set1_epi32(26386); + const __m256i round64 = _mm256_set1_epi64x(1 << (bpc - 1)); + const __m256i bpc_vec = _mm256_set1_epi64x(bpc); + const __m256i perm_idx = _mm256_setr_epi32(0, 2, 4, 6, 0, 0, 0, 0); + + uint64_t sad = 0; + + for (unsigned i = 0; i < h; i++) { + const uint16_t *pp[5], *cp[5]; + for (int k = 0; k < 5; k++) { + int r = mirror((int)i - 2 + k, (int)h); + pp[k] = prev + r * p_stride; + cp[k] = cur + r * c_stride; + } + + // Phase 1: diff + y_conv -> y_row (8 pixels at a time, int64 accum) + unsigned j; + __m256i nz_acc = _mm256_setzero_si256(); + for (j = 0; j + 8 <= w; j += 8) { + __m256i d0 = _mm256_sub_epi32( + _mm256_cvtepu16_epi32(_mm_loadu_si128((__m128i*)(pp[0] + j))), + _mm256_cvtepu16_epi32(_mm_loadu_si128((__m128i*)(cp[0] + j)))); + __m256i d1 = _mm256_sub_epi32( + _mm256_cvtepu16_epi32(_mm_loadu_si128((__m128i*)(pp[1] + j))), + _mm256_cvtepu16_epi32(_mm_loadu_si128((__m128i*)(cp[1] + j)))); + __m256i d2 = _mm256_sub_epi32( + _mm256_cvtepu16_epi32(_mm_loadu_si128((__m128i*)(pp[2] + j))), + _mm256_cvtepu16_epi32(_mm_loadu_si128((__m128i*)(cp[2] + j)))); + __m256i d3 = _mm256_sub_epi32( + _mm256_cvtepu16_epi32(_mm_loadu_si128((__m128i*)(pp[3] + j))), + _mm256_cvtepu16_epi32(_mm_loadu_si128((__m128i*)(cp[3] + j)))); + __m256i d4 = _mm256_sub_epi32( + _mm256_cvtepu16_epi32(_mm_loadu_si128((__m128i*)(pp[4] + j))), + _mm256_cvtepu16_epi32(_mm_loadu_si128((__m128i*)(cp[4] + j)))); + + __m256i prod0 = _mm256_mullo_epi32(d0, g0); + __m256i prod1 = _mm256_mullo_epi32(d1, g1); + __m256i prod2 = _mm256_mullo_epi32(d2, g2); + __m256i prod3 = _mm256_mullo_epi32(d3, g1); + __m256i prod4 = 
_mm256_mullo_epi32(d4, g0); + + __m256i acc_lo = _mm256_cvtepi32_epi64(_mm256_castsi256_si128(prod0)); + acc_lo = _mm256_add_epi64(acc_lo, _mm256_cvtepi32_epi64(_mm256_castsi256_si128(prod1))); + acc_lo = _mm256_add_epi64(acc_lo, _mm256_cvtepi32_epi64(_mm256_castsi256_si128(prod2))); + acc_lo = _mm256_add_epi64(acc_lo, _mm256_cvtepi32_epi64(_mm256_castsi256_si128(prod3))); + acc_lo = _mm256_add_epi64(acc_lo, _mm256_cvtepi32_epi64(_mm256_castsi256_si128(prod4))); + + __m256i acc_hi = _mm256_cvtepi32_epi64(_mm256_extracti128_si256(prod0, 1)); + acc_hi = _mm256_add_epi64(acc_hi, _mm256_cvtepi32_epi64(_mm256_extracti128_si256(prod1, 1))); + acc_hi = _mm256_add_epi64(acc_hi, _mm256_cvtepi32_epi64(_mm256_extracti128_si256(prod2, 1))); + acc_hi = _mm256_add_epi64(acc_hi, _mm256_cvtepi32_epi64(_mm256_extracti128_si256(prod3, 1))); + acc_hi = _mm256_add_epi64(acc_hi, _mm256_cvtepi32_epi64(_mm256_extracti128_si256(prod4, 1))); + + acc_lo = _mm256_srlv_epi64(_mm256_add_epi64(acc_lo, round64), bpc_vec); + acc_hi = _mm256_srlv_epi64(_mm256_add_epi64(acc_hi, round64), bpc_vec); + + __m128i res_lo = _mm256_castsi256_si128( + _mm256_permutevar8x32_epi32(acc_lo, perm_idx)); + __m128i res_hi = _mm256_castsi256_si128( + _mm256_permutevar8x32_epi32(acc_hi, perm_idx)); + + __m256i result = _mm256_inserti128_si256( + _mm256_castsi128_si256(res_lo), res_hi, 1); + _mm256_storeu_si256((__m256i*)(y_row + j), result); + nz_acc = _mm256_or_si256(nz_acc, result); + } + + // Scalar tail for phase 1 + int32_t nz_tail = 0; + for (; j < w; j++) { + int64_t accum = 0; + for (int k = 0; k < 5; k++) { + int32_t diff = pp[k][j] - cp[k][j]; + accum += (int64_t)filter[k] * diff; + } + y_row[j] = (int32_t)((accum + (1 << (bpc - 1))) >> bpc); + nz_tail |= y_row[j]; + } + + if (_mm256_testz_si256(nz_acc, nz_acc) && !nz_tail) continue; + + // Phase 2: SIMD x_conv + abs + accumulate + sad += x_conv_row_sad_avx2(y_row, w); + } + + return sad; +} + +uint64_t motion_score_pipeline_8_avx2(const uint8_t *prev, 
ptrdiff_t prev_stride, + const uint8_t *cur, ptrdiff_t cur_stride, + int32_t *y_row, unsigned w, unsigned h, + unsigned bpc) +{ + (void)bpc; + const __m256i f0 = _mm256_set1_epi16(3571); + const __m256i f1 = _mm256_set1_epi16(16004); + const __m256i f2 = _mm256_set1_epi16(26386); + const __m256i round8 = _mm256_set1_epi32(1 << 7); + + uint64_t sad = 0; + + for (unsigned i = 0; i < h; i++) { + const uint8_t *p[5], *c[5]; + for (int k = 0; k < 5; k++) { + int r = mirror((int)i - 2 + k, (int)h); + p[k] = prev + r * prev_stride; + c[k] = cur + r * cur_stride; + } + + // Phase 1: diff + y_conv -> y_row (16 columns at a time, shift >>8) + unsigned j; + __m256i nz_acc = _mm256_setzero_si256(); + for (j = 0; j + 16 <= w; j += 16) { + __m256i d0 = _mm256_sub_epi16( + _mm256_cvtepu8_epi16(_mm_loadu_si128((__m128i*)(p[0] + j))), + _mm256_cvtepu8_epi16(_mm_loadu_si128((__m128i*)(c[0] + j)))); + __m256i d1 = _mm256_sub_epi16( + _mm256_cvtepu8_epi16(_mm_loadu_si128((__m128i*)(p[1] + j))), + _mm256_cvtepu8_epi16(_mm_loadu_si128((__m128i*)(c[1] + j)))); + __m256i d2 = _mm256_sub_epi16( + _mm256_cvtepu8_epi16(_mm_loadu_si128((__m128i*)(p[2] + j))), + _mm256_cvtepu8_epi16(_mm_loadu_si128((__m128i*)(c[2] + j)))); + __m256i d3 = _mm256_sub_epi16( + _mm256_cvtepu8_epi16(_mm_loadu_si128((__m128i*)(p[3] + j))), + _mm256_cvtepu8_epi16(_mm_loadu_si128((__m128i*)(c[3] + j)))); + __m256i d4 = _mm256_sub_epi16( + _mm256_cvtepu8_epi16(_mm_loadu_si128((__m128i*)(p[4] + j))), + _mm256_cvtepu8_epi16(_mm_loadu_si128((__m128i*)(c[4] + j)))); + + __m256i lo = _mm256_mullo_epi16(d0, f0); + __m256i hi = _mm256_mulhi_epi16(d0, f0); + __m256i acc_lo = _mm256_unpacklo_epi16(lo, hi); + __m256i acc_hi = _mm256_unpackhi_epi16(lo, hi); + + lo = _mm256_mullo_epi16(d1, f1); + hi = _mm256_mulhi_epi16(d1, f1); + acc_lo = _mm256_add_epi32(acc_lo, _mm256_unpacklo_epi16(lo, hi)); + acc_hi = _mm256_add_epi32(acc_hi, _mm256_unpackhi_epi16(lo, hi)); + + lo = _mm256_mullo_epi16(d2, f2); + hi = _mm256_mulhi_epi16(d2, 
f2); + acc_lo = _mm256_add_epi32(acc_lo, _mm256_unpacklo_epi16(lo, hi)); + acc_hi = _mm256_add_epi32(acc_hi, _mm256_unpackhi_epi16(lo, hi)); + + lo = _mm256_mullo_epi16(d3, f1); + hi = _mm256_mulhi_epi16(d3, f1); + acc_lo = _mm256_add_epi32(acc_lo, _mm256_unpacklo_epi16(lo, hi)); + acc_hi = _mm256_add_epi32(acc_hi, _mm256_unpackhi_epi16(lo, hi)); + + lo = _mm256_mullo_epi16(d4, f0); + hi = _mm256_mulhi_epi16(d4, f0); + acc_lo = _mm256_add_epi32(acc_lo, _mm256_unpacklo_epi16(lo, hi)); + acc_hi = _mm256_add_epi32(acc_hi, _mm256_unpackhi_epi16(lo, hi)); + + acc_lo = _mm256_srai_epi32(_mm256_add_epi32(acc_lo, round8), 8); + acc_hi = _mm256_srai_epi32(_mm256_add_epi32(acc_hi, round8), 8); + + __m256i cols_0_7 = _mm256_permute2x128_si256(acc_lo, acc_hi, 0x20); + __m256i cols_8_15 = _mm256_permute2x128_si256(acc_lo, acc_hi, 0x31); + _mm256_storeu_si256((__m256i*)(y_row + j), cols_0_7); + _mm256_storeu_si256((__m256i*)(y_row + j + 8), cols_8_15); + nz_acc = _mm256_or_si256(nz_acc, _mm256_or_si256(cols_0_7, cols_8_15)); + } + + // Scalar tail for phase 1 + int32_t nz_tail = 0; + for (; j < w; j++) { + int32_t accum = 0; + for (int k = 0; k < 5; k++) { + int32_t diff = p[k][j] - c[k][j]; + accum += (int32_t)filter[k] * diff; + } + y_row[j] = (accum + (1 << 7)) >> 8; + nz_tail |= y_row[j]; + } + + if (_mm256_testz_si256(nz_acc, nz_acc) && !nz_tail) continue; + + // Phase 2: SIMD x_conv + abs + accumulate + sad += x_conv_row_sad_avx2(y_row, w); + } + + return sad; +} diff --git a/libvmaf/src/feature/x86/motion_v2_avx2.h b/libvmaf/src/feature/x86/motion_v2_avx2.h new file mode 100644 index 000000000..333243af2 --- /dev/null +++ b/libvmaf/src/feature/x86/motion_v2_avx2.h @@ -0,0 +1,35 @@ +/** + * + * Copyright 2016-2025 Netflix, Inc. + * + * Licensed under the BSD+Patent License (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * https://opensource.org/licenses/BSDplusPatent + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + */ + +#ifndef X86_AVX2_MOTION_V2_H_ +#define X86_AVX2_MOTION_V2_H_ + +#include +#include + +uint64_t motion_score_pipeline_8_avx2(const uint8_t *prev, ptrdiff_t prev_stride, + const uint8_t *cur, ptrdiff_t cur_stride, + int32_t *y_row, unsigned w, unsigned h, + unsigned bpc); + +uint64_t motion_score_pipeline_16_avx2(const uint8_t *prev, ptrdiff_t prev_stride, + const uint8_t *cur, ptrdiff_t cur_stride, + int32_t *y_row, unsigned w, unsigned h, + unsigned bpc); + +#endif /* X86_AVX2_MOTION_V2_H_ */ diff --git a/libvmaf/src/feature/x86/motion_v2_avx512.c b/libvmaf/src/feature/x86/motion_v2_avx512.c new file mode 100644 index 000000000..218fc61eb --- /dev/null +++ b/libvmaf/src/feature/x86/motion_v2_avx512.c @@ -0,0 +1,314 @@ +/** + * + * Copyright 2016-2025 Netflix, Inc. + * + * Licensed under the BSD+Patent License (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * https://opensource.org/licenses/BSDplusPatent + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ * + */ + +#include +#include +#include +#include + +#include "feature/integer_motion.h" + +static inline int mirror(int idx, int size) +{ + if (idx < 0) return -idx; + if (idx >= size) return 2 * size - idx - 1; + return idx; +} + +// SIMD phase 2: x_conv + abs + SAD for one row of int32 y_row. +// Processes 16 int32 columns at a time via mullo_epi32 + int64 accumulation. +static inline uint32_t +x_conv_row_sad_avx512(const int32_t *y_row, unsigned w) +{ + const __m512i g0 = _mm512_set1_epi32(3571); + const __m512i g1 = _mm512_set1_epi32(16004); + const __m512i g2 = _mm512_set1_epi32(26386); + const __m512i round64 = _mm512_set1_epi64(1 << 15); + + uint32_t row_sad = 0; + + // Scalar left edge (columns 0, 1) — mirror boundary + unsigned j; + for (j = 0; j < 2 && j < w; j++) { + int64_t accum = 0; + for (int k = 0; k < 5; k++) { + int col = mirror((int)j - 2 + k, (int)w); + accum += (int64_t)filter[k] * y_row[col]; + } + int32_t val = (int32_t)((accum + (1 << 15)) >> 16); + row_sad += abs(val); + } + + // SIMD middle: need y_row[j-2]..y_row[j+17], so j+18 <= w + __m512i sad_acc = _mm512_setzero_si512(); + for (; j + 18 <= w; j += 16) { + __m512i y0 = _mm512_loadu_si512((__m512i*)(y_row + j - 2)); + __m512i y1 = _mm512_loadu_si512((__m512i*)(y_row + j - 1)); + __m512i y2 = _mm512_loadu_si512((__m512i*)(y_row + j)); + __m512i y3 = _mm512_loadu_si512((__m512i*)(y_row + j + 1)); + __m512i y4 = _mm512_loadu_si512((__m512i*)(y_row + j + 2)); + + // Each product fits in int32 + __m512i p0 = _mm512_mullo_epi32(y0, g0); + __m512i p1 = _mm512_mullo_epi32(y1, g1); + __m512i p2 = _mm512_mullo_epi32(y2, g2); + __m512i p3 = _mm512_mullo_epi32(y3, g1); + __m512i p4 = _mm512_mullo_epi32(y4, g0); + + // Safe pairs that fit in int32 + __m512i s04 = _mm512_add_epi32(p0, p4); + __m512i s13 = _mm512_add_epi32(p1, p3); + + // Widen to int64 and accumulate (lo 8 elements) + __m512i acc_lo = _mm512_cvtepi32_epi64(_mm512_castsi512_si256(s04)); + acc_lo = _mm512_add_epi64(acc_lo, 
_mm512_cvtepi32_epi64(_mm512_castsi512_si256(s13))); + acc_lo = _mm512_add_epi64(acc_lo, _mm512_cvtepi32_epi64(_mm512_castsi512_si256(p2))); + + // hi 8 elements + __m512i acc_hi = _mm512_cvtepi32_epi64(_mm512_extracti64x4_epi64(s04, 1)); + acc_hi = _mm512_add_epi64(acc_hi, _mm512_cvtepi32_epi64(_mm512_extracti64x4_epi64(s13, 1))); + acc_hi = _mm512_add_epi64(acc_hi, _mm512_cvtepi32_epi64(_mm512_extracti64x4_epi64(p2, 1))); + + // Round and arithmetic right shift >>16 (native in AVX-512) + acc_lo = _mm512_srai_epi64(_mm512_add_epi64(acc_lo, round64), 16); + acc_hi = _mm512_srai_epi64(_mm512_add_epi64(acc_hi, round64), 16); + + // Narrow int64 -> int32 (signed saturation) + __m256i res_lo = _mm512_cvtsepi64_epi32(acc_lo); + __m256i res_hi = _mm512_cvtsepi64_epi32(acc_hi); + + // Combine into 16 x int32, abs, accumulate + __m512i result = _mm512_inserti64x4(_mm512_castsi256_si512(res_lo), res_hi, 1); + __m512i abs_result = _mm512_abs_epi32(result); + + sad_acc = _mm512_add_epi32(sad_acc, abs_result); + } + + row_sad += (uint32_t)_mm512_reduce_add_epi32(sad_acc); + + // Scalar right edge + tail + for (; j < w; j++) { + int64_t accum = 0; + for (int k = 0; k < 5; k++) { + int col = mirror((int)j - 2 + k, (int)w); + accum += (int64_t)filter[k] * y_row[col]; + } + int32_t val = (int32_t)((accum + (1 << 15)) >> 16); + row_sad += abs(val); + } + + return row_sad; +} + +uint64_t motion_score_pipeline_16_avx512(const uint8_t *prev_u8, ptrdiff_t prev_stride, + const uint8_t *cur_u8, ptrdiff_t cur_stride, + int32_t *y_row, unsigned w, unsigned h, + unsigned bpc) +{ + const uint16_t *prev = (const uint16_t *)prev_u8; + const uint16_t *cur = (const uint16_t *)cur_u8; + const ptrdiff_t p_stride = prev_stride / 2; + const ptrdiff_t c_stride = cur_stride / 2; + + const __m512i g0 = _mm512_set1_epi32(3571); + const __m512i g1 = _mm512_set1_epi32(16004); + const __m512i g2 = _mm512_set1_epi32(26386); + const __m512i round64 = _mm512_set1_epi64(1 << (bpc - 1)); + const __m512i bpc_vec 
= _mm512_set1_epi64(bpc); + + uint64_t sad = 0; + + for (unsigned i = 0; i < h; i++) { + const uint16_t *pp[5], *cp[5]; + for (int k = 0; k < 5; k++) { + int r = mirror((int)i - 2 + k, (int)h); + pp[k] = prev + r * p_stride; + cp[k] = cur + r * c_stride; + } + + unsigned j; + __m512i nz_acc = _mm512_setzero_si512(); + for (j = 0; j + 16 <= w; j += 16) { + __m512i d0 = _mm512_sub_epi32( + _mm512_cvtepu16_epi32(_mm256_loadu_si256((__m256i*)(pp[0] + j))), + _mm512_cvtepu16_epi32(_mm256_loadu_si256((__m256i*)(cp[0] + j)))); + __m512i d1 = _mm512_sub_epi32( + _mm512_cvtepu16_epi32(_mm256_loadu_si256((__m256i*)(pp[1] + j))), + _mm512_cvtepu16_epi32(_mm256_loadu_si256((__m256i*)(cp[1] + j)))); + __m512i d2 = _mm512_sub_epi32( + _mm512_cvtepu16_epi32(_mm256_loadu_si256((__m256i*)(pp[2] + j))), + _mm512_cvtepu16_epi32(_mm256_loadu_si256((__m256i*)(cp[2] + j)))); + __m512i d3 = _mm512_sub_epi32( + _mm512_cvtepu16_epi32(_mm256_loadu_si256((__m256i*)(pp[3] + j))), + _mm512_cvtepu16_epi32(_mm256_loadu_si256((__m256i*)(cp[3] + j)))); + __m512i d4 = _mm512_sub_epi32( + _mm512_cvtepu16_epi32(_mm256_loadu_si256((__m256i*)(pp[4] + j))), + _mm512_cvtepu16_epi32(_mm256_loadu_si256((__m256i*)(cp[4] + j)))); + + __m512i prod0 = _mm512_mullo_epi32(d0, g0); + __m512i prod1 = _mm512_mullo_epi32(d1, g1); + __m512i prod2 = _mm512_mullo_epi32(d2, g2); + __m512i prod3 = _mm512_mullo_epi32(d3, g1); + __m512i prod4 = _mm512_mullo_epi32(d4, g0); + + __m512i acc_lo = _mm512_cvtepi32_epi64(_mm512_castsi512_si256(prod0)); + acc_lo = _mm512_add_epi64(acc_lo, _mm512_cvtepi32_epi64(_mm512_castsi512_si256(prod1))); + acc_lo = _mm512_add_epi64(acc_lo, _mm512_cvtepi32_epi64(_mm512_castsi512_si256(prod2))); + acc_lo = _mm512_add_epi64(acc_lo, _mm512_cvtepi32_epi64(_mm512_castsi512_si256(prod3))); + acc_lo = _mm512_add_epi64(acc_lo, _mm512_cvtepi32_epi64(_mm512_castsi512_si256(prod4))); + + __m512i acc_hi = _mm512_cvtepi32_epi64(_mm512_extracti64x4_epi64(prod0, 1)); + acc_hi = _mm512_add_epi64(acc_hi, 
_mm512_cvtepi32_epi64(_mm512_extracti64x4_epi64(prod1, 1)));
+            acc_hi = _mm512_add_epi64(acc_hi, _mm512_cvtepi32_epi64(_mm512_extracti64x4_epi64(prod2, 1)));
+            acc_hi = _mm512_add_epi64(acc_hi, _mm512_cvtepi32_epi64(_mm512_extracti64x4_epi64(prod3, 1)));
+            acc_hi = _mm512_add_epi64(acc_hi, _mm512_cvtepi32_epi64(_mm512_extracti64x4_epi64(prod4, 1)));
+
+            // Round-to-nearest then arithmetic shift by bpc, matching the
+            // scalar tail's (accum + (1 << (bpc - 1))) >> bpc.
+            acc_lo = _mm512_srav_epi64(_mm512_add_epi64(acc_lo, round64), bpc_vec);
+            acc_hi = _mm512_srav_epi64(_mm512_add_epi64(acc_hi, round64), bpc_vec);
+
+            // Saturating 64->32 narrow.  After >>bpc the magnitudes are on the
+            // order of 2^16, so saturation should never actually trigger and
+            // this stays consistent with the plain cast in the scalar tail.
+            __m256i res_lo = _mm512_cvtsepi64_epi32(acc_lo);
+            __m256i res_hi = _mm512_cvtsepi64_epi32(acc_hi);
+
+            __m512i result = _mm512_inserti64x4(_mm512_castsi256_si512(res_lo), res_hi, 1);
+            _mm512_storeu_si512((__m512i*)(y_row + j), result);
+            // OR into nz_acc so an all-zero blurred row can be skipped below.
+            nz_acc = _mm512_or_si512(nz_acc, result);
+        }
+
+        // Scalar tail for columns past the last full 16-wide vector; same
+        // taps (file-scope filter[]) and same rounding as the vector path.
+        int32_t nz_tail = 0;
+        for (; j < w; j++) {
+            int64_t accum = 0;
+            for (int k = 0; k < 5; k++) {
+                int32_t diff = pp[k][j] - cp[k][j];
+                accum += (int64_t)filter[k] * diff;
+            }
+            y_row[j] = (int32_t)((accum + (1 << (bpc - 1))) >> bpc);
+            nz_tail |= y_row[j];
+        }
+
+        // A row whose blurred difference is entirely zero contributes zero to
+        // the SAD, so the horizontal pass can be skipped outright.
+        if (_mm512_test_epi32_mask(nz_acc, nz_acc) == 0 && !nz_tail) continue;
+
+        sad += x_conv_row_sad_avx512(y_row, w);
+    }
+
+    return sad;
+}
+
+// 8-bit row-pipelined motion kernel: for each output row, form the 5-tap
+// vertically blurred difference of prev and cur (taps {3571, 16004, 26386,
+// 16004, 3571}, mirrored edges via mirror()), then accumulate the row's
+// contribution through x_conv_row_sad_avx512() (file-local helper;
+// presumably the horizontal blur + abs-sum pass -- confirm).  y_row is one
+// row (w int32 values) of caller-provided scratch.  Returns the raw SAD
+// accumulated over all h rows.
+uint64_t motion_score_pipeline_8_avx512(const uint8_t *prev, ptrdiff_t prev_stride,
+                                        const uint8_t *cur, ptrdiff_t cur_stride,
+                                        int32_t *y_row, unsigned w, unsigned h,
+                                        unsigned bpc)
+{
+    // bpc is unused: the 8-bit path hardcodes round 1<<7 and shift 8.
+    (void)bpc;
+    const __m512i f0 = _mm512_set1_epi16(3571);
+    const __m512i f1 = _mm512_set1_epi16(16004);
+    const __m512i f2 = _mm512_set1_epi16(26386);
+    const __m512i round8 = _mm512_set1_epi32(1 << 7);
+
+    uint64_t sad = 0;
+
+    for (unsigned i = 0; i < h; i++) {
+        // Row pointers for the 5 vertical taps, mirrored at the borders.
+        const uint8_t *p[5], *c[5];
+        for (int k = 0; k < 5; k++) {
+            int r = mirror((int)i - 2 + k, (int)h);
+            p[k] = prev + r * prev_stride;
+            c[k] = cur + r * cur_stride;
+        }
+
+        unsigned j;
+        __m512i nz_acc = _mm512_setzero_si512();
+        for (j = 0; j + 32 <= w; j += 32) {
+            // Widen 32 bytes to 16-bit lanes and difference prev - cur;
+            // d fits int16 (range [-255, 255]).
+            __m512i d0 = _mm512_sub_epi16(
+                _mm512_cvtepu8_epi16(_mm256_loadu_si256((__m256i*)(p[0] + j))),
+                _mm512_cvtepu8_epi16(_mm256_loadu_si256((__m256i*)(c[0] + j))));
+            __m512i d1 = _mm512_sub_epi16(
+                _mm512_cvtepu8_epi16(_mm256_loadu_si256((__m256i*)(p[1] + j))),
+                _mm512_cvtepu8_epi16(_mm256_loadu_si256((__m256i*)(c[1] + j))));
+            __m512i d2 = _mm512_sub_epi16(
+                _mm512_cvtepu8_epi16(_mm256_loadu_si256((__m256i*)(p[2] + j))),
+                _mm512_cvtepu8_epi16(_mm256_loadu_si256((__m256i*)(c[2] + j))));
+            __m512i d3 = _mm512_sub_epi16(
+                _mm512_cvtepu8_epi16(_mm256_loadu_si256((__m256i*)(p[3] + j))),
+                _mm512_cvtepu8_epi16(_mm256_loadu_si256((__m256i*)(c[3] + j))));
+            __m512i d4 = _mm512_sub_epi16(
+                _mm512_cvtepu8_epi16(_mm256_loadu_si256((__m256i*)(p[4] + j))),
+                _mm512_cvtepu8_epi16(_mm256_loadu_si256((__m256i*)(c[4] + j))));
+
+            // Signed 16x16 -> 32-bit products: mullo/mulhi give the low and
+            // high 16-bit halves, unpacklo/hi interleave them back into full
+            // int32 lanes.  |d| <= 255 and taps <= 26386, so the 5-tap sum
+            // (max 255 * 65536) fits int32 without overflow.
+            __m512i lo = _mm512_mullo_epi16(d0, f0);
+            __m512i hi = _mm512_mulhi_epi16(d0, f0);
+            __m512i acc_lo = _mm512_unpacklo_epi16(lo, hi);
+            __m512i acc_hi = _mm512_unpackhi_epi16(lo, hi);
+
+            lo = _mm512_mullo_epi16(d1, f1); hi = _mm512_mulhi_epi16(d1, f1);
+            acc_lo = _mm512_add_epi32(acc_lo, _mm512_unpacklo_epi16(lo, hi));
+            acc_hi = _mm512_add_epi32(acc_hi, _mm512_unpackhi_epi16(lo, hi));
+
+            lo = _mm512_mullo_epi16(d2, f2); hi = _mm512_mulhi_epi16(d2, f2);
+            acc_lo = _mm512_add_epi32(acc_lo, _mm512_unpacklo_epi16(lo, hi));
+            acc_hi = _mm512_add_epi32(acc_hi, _mm512_unpackhi_epi16(lo, hi));
+
+            lo = _mm512_mullo_epi16(d3, f1); hi = _mm512_mulhi_epi16(d3, f1);
+            acc_lo = _mm512_add_epi32(acc_lo, _mm512_unpacklo_epi16(lo, hi));
+            acc_hi = _mm512_add_epi32(acc_hi, _mm512_unpackhi_epi16(lo, hi));
+
+            lo = _mm512_mullo_epi16(d4, f0); hi = _mm512_mulhi_epi16(d4, f0);
+            acc_lo = _mm512_add_epi32(acc_lo, _mm512_unpacklo_epi16(lo, hi));
+            acc_hi = _mm512_add_epi32(acc_hi, _mm512_unpackhi_epi16(lo, hi));
+
+            // Round-to-nearest and arithmetic shift, matching the scalar
+            // tail's (accum + (1 << 7)) >> 8.
+            acc_lo = _mm512_srai_epi32(_mm512_add_epi32(acc_lo, round8), 8);
+            acc_hi = _mm512_srai_epi32(_mm512_add_epi32(acc_hi, round8), 8);
+
+            __m256i lo_lo = _mm512_castsi512_si256(acc_lo);
+            __m256i lo_hi = _mm512_extracti64x4_epi64(acc_lo, 1);
+            __m256i hi_lo = _mm512_castsi512_si256(acc_hi);
+            __m256i hi_hi = _mm512_extracti64x4_epi64(acc_hi, 1);
+
+            // unpacklo/hi operate within 128-bit lanes, so the int32 results
+            // are lane-scrambled (acc_lo holds columns 0-3/8-11/16-19/24-27,
+            // acc_hi the complementary quads).  These cross-lane shuffles
+            // restore ascending column order before the contiguous stores.
+            __m256i cols_0_7 = _mm256_permute2x128_si256(lo_lo, hi_lo, 0x20);
+            __m256i cols_8_15 = _mm256_permute2x128_si256(lo_lo, hi_lo, 0x31);
+            __m256i cols_16_23 = _mm256_permute2x128_si256(lo_hi, hi_hi, 0x20);
+            __m256i cols_24_31 = _mm256_permute2x128_si256(lo_hi, hi_hi, 0x31);
+
+            _mm256_storeu_si256((__m256i*)(y_row + j), cols_0_7);
+            _mm256_storeu_si256((__m256i*)(y_row + j + 8), cols_8_15);
+            _mm256_storeu_si256((__m256i*)(y_row + j + 16), cols_16_23);
+            _mm256_storeu_si256((__m256i*)(y_row + j + 24), cols_24_31);
+
+            // Fold everything just stored into nz_acc for the all-zero test.
+            __m512i stored = _mm512_inserti64x4(
+                _mm512_castsi256_si512(
+                    _mm256_or_si256(cols_0_7, cols_8_15)),
+                _mm256_or_si256(cols_16_23, cols_24_31), 1);
+            nz_acc = _mm512_or_si512(nz_acc, stored);
+        }
+
+        // Scalar tail for columns past the last full 32-wide vector; same
+        // taps (file-scope filter[], values matching f0/f1/f2) and rounding.
+        int32_t nz_tail = 0;
+        for (; j < w; j++) {
+            int32_t accum = 0;
+            for (int k = 0; k < 5; k++) {
+                int32_t diff = p[k][j] - c[k][j];
+                accum += (int32_t)filter[k] * diff;
+            }
+            y_row[j] = (accum + (1 << 7)) >> 8;
+            nz_tail |= y_row[j];
+        }
+
+        // Skip the horizontal pass for rows whose blurred difference is zero.
+        if (_mm512_test_epi32_mask(nz_acc, nz_acc) == 0 && !nz_tail) continue;
+
+        sad += x_conv_row_sad_avx512(y_row, w);
+    }
+
+    return sad;
+}
diff --git a/libvmaf/src/feature/x86/motion_v2_avx512.h b/libvmaf/src/feature/x86/motion_v2_avx512.h
new file mode 100644
index 000000000..f4eda6a71
--- /dev/null
+++ b/libvmaf/src/feature/x86/motion_v2_avx512.h
@@ -0,0 +1,35 @@
+/**
+ *
+ * Copyright 2016-2025 Netflix, Inc.
+ *
+ * Licensed under the BSD+Patent License (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * https://opensource.org/licenses/BSDplusPatent
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ */
+
+#ifndef X86_AVX512_MOTION_V2_H_
+#define X86_AVX512_MOTION_V2_H_
+
+/* Fixed: the two include directives below had lost their operands (bare
+ * "#include").  Restored from the types used by the prototypes:
+ * uint8_t/int32_t/uint64_t need <stdint.h>, ptrdiff_t needs <stddef.h>. */
+#include <stdint.h>
+#include <stddef.h>
+
+/* AVX-512 row-pipelined motion kernels: blur the prev/cur frame difference
+ * with the 5-tap kernel and return the accumulated SAD over all rows.
+ * y_row is one row (w int32 values) of caller-provided scratch. */
+uint64_t motion_score_pipeline_8_avx512(const uint8_t *prev, ptrdiff_t prev_stride,
+                                        const uint8_t *cur, ptrdiff_t cur_stride,
+                                        int32_t *y_row, unsigned w, unsigned h,
+                                        unsigned bpc);
+
+/* NOTE(review): the 16-bit implementation indexes its rows through
+ * const uint16_t * pointers; confirm whether this prototype should take
+ * const uint16_t *prev/cur (with an element stride) rather than uint8_t *. */
+uint64_t motion_score_pipeline_16_avx512(const uint8_t *prev, ptrdiff_t prev_stride,
+                                        const uint8_t *cur, ptrdiff_t cur_stride,
+                                        int32_t *y_row, unsigned w, unsigned h,
+                                        unsigned bpc);
+
+#endif /* X86_AVX512_MOTION_V2_H_ */
diff --git a/libvmaf/src/libvmaf.c b/libvmaf/src/libvmaf.c
index ee95b202b..49bd059be 100644
--- a/libvmaf/src/libvmaf.c
+++ b/libvmaf/src/libvmaf.c
@@ -90,6 +90,7 @@ typedef struct VmafContext {
     } pic_params;
     unsigned pic_cnt;
     bool flushed;
+    VmafPicture prev_ref; // previous ref pic for PREV_REF extractors (in-order only)
 } VmafContext;
 
 #ifdef VMAF_BATCH_THREADING
@@ -348,6 +349,8 @@ int vmaf_close(VmafContext *vmaf)
     if (!vmaf) return -EINVAL;
     vmaf_thread_pool_wait(vmaf->thread_pool);
+    if (vmaf->prev_ref.ref)
+        vmaf_picture_unref(&vmaf->prev_ref);
     vmaf_framesync_destroy(vmaf->framesync);
     feature_extractor_vector_destroy(&(vmaf->registered_feature_extractors));
     vmaf_feature_collector_destroy(vmaf->feature_collector);
@@ -483,7 +486,7 @@ int vmaf_use_features_from_model_collection(VmafContext *vmaf,
 
 struct ThreadData {
     VmafFeatureExtractorContext *fex_ctx;
-    VmafPicture ref, dist;
+    VmafPicture ref, dist, prev_ref;
     unsigned index;
     VmafFeatureCollector *feature_collector;
     VmafFeatureExtractorContextPool *fex_ctx_pool;
@@ -494,9 +497,19
@@ static void threaded_extract_func(void *e, void **thread_data)
 {
     (void) thread_data;
     struct ThreadData *f = e;
+
+    // Hand the previous reference picture to the extractor as a borrowed
+    // (non-owning) struct copy; it is valid only for the duration of the
+    // extract call and is cleared again immediately afterwards.
+    // NOTE(review): assumes a pooled fex_ctx is checked out by exactly one
+    // thread at a time -- confirm the pool guarantees exclusive use.
+    if (f->prev_ref.ref)
+        f->fex_ctx->fex->prev_ref = f->prev_ref;
+
     f->err = vmaf_feature_extractor_context_extract(f->fex_ctx, &f->ref, NULL,
                                                     &f->dist, NULL, f->index,
                                                     f->feature_collector);
+
+    if (f->prev_ref.ref) {
+        memset(&f->fex_ctx->fex->prev_ref, 0, sizeof(f->fex_ctx->fex->prev_ref));
+        vmaf_picture_unref(&f->prev_ref);
+    }
+
+    // NOTE(review): pre-existing -- this assignment overwrites any error the
+    // extract call stored in f->err; consider preserving the first error.
     f->err = vmaf_fex_ctx_pool_release(f->fex_ctx_pool, f->fex_ctx);
     vmaf_picture_unref(&f->ref);
     vmaf_picture_unref(&f->dist);
@@ -504,7 +517,7 @@ static void threaded_extract_batch_func(void *e, void **thread_data)
 
 #ifdef VMAF_BATCH_THREADING
 struct ThreadDataBatch {
-    VmafPicture ref, dist;
+    VmafPicture ref, dist, prev_ref;
     unsigned index;
     VmafFeatureCollector *feature_collector;
     RegisteredFeatureExtractors *registered_fex;
@@ -551,11 +564,21 @@ static void threaded_extract_batch_func(void *e, void **thread_data)
                 if (err) { f->err = err; break; }
             }
 
+            // Same borrowed-copy handoff as threaded_extract_func, applied
+            // per extractor inside the batch loop.
+            if (fex->flags & VMAF_FEATURE_EXTRACTOR_PREV_REF) {
+                if (f->prev_ref.ref)
+                    td->fex_ctx[i]->fex->prev_ref = f->prev_ref;
+            }
+
             int err = vmaf_feature_extractor_context_extract(td->fex_ctx[i],
                                                              &f->ref, NULL,
                                                              &f->dist, NULL,
                                                              f->index,
                                                              f->feature_collector);
+
+            if (fex->flags & VMAF_FEATURE_EXTRACTOR_PREV_REF)
+                memset(&td->fex_ctx[i]->fex->prev_ref, 0,
+                       sizeof(td->fex_ctx[i]->fex->prev_ref));
+
             if (err) {
                 f->err = err;
                 break;
@@ -563,6 +586,8 @@ static void threaded_extract_batch_func(void *e, void **thread_data)
             }
         }
 unref:
+    if (f->prev_ref.ref)
+        vmaf_picture_unref(&f->prev_ref);
     vmaf_picture_unref(&f->ref);
     vmaf_picture_unref(&f->dist);
 }
@@ -597,14 +622,21 @@ static int threaded_read_pictures(VmafContext *vmaf, VmafPicture *ref,
                                                &fex_ctx);
         if (err) return err;
 
-        VmafPicture pic_a, pic_b;
+        VmafPicture pic_a, pic_b, prev_ref = { 0 };
         vmaf_picture_ref(&pic_a, ref);
         vmaf_picture_ref(&pic_b, dist);
 
+        // Take a counted reference on frame N-1's ref for the worker thread.
+        if ((fex->flags & VMAF_FEATURE_EXTRACTOR_PREV_REF) &&
+            vmaf->prev_ref.ref)
+        {
vmaf_picture_ref(&prev_ref, &vmaf->prev_ref);
+        }
+
         struct ThreadData data = {
             .fex_ctx = fex_ctx,
             .ref = pic_a,
             .dist = pic_b,
+            .prev_ref = prev_ref,
             .index = index,
             .feature_collector = vmaf->feature_collector,
             .fex_ctx_pool = vmaf->fex_ctx_pool,
@@ -616,10 +648,15 @@ static int threaded_read_pictures(VmafContext *vmaf, VmafPicture *ref,
         if (err) {
             vmaf_picture_unref(&pic_a);
             vmaf_picture_unref(&pic_b);
+            if (prev_ref.ref) vmaf_picture_unref(&prev_ref);
             return err;
         }
     }
 
+    // Advance the sliding "previous reference" to this frame.  Workers queued
+    // above hold their own counted refs, so dropping ours here is safe.
+    if (vmaf->prev_ref.ref)
+        vmaf_picture_unref(&vmaf->prev_ref);
+    vmaf_picture_ref(&vmaf->prev_ref, ref);
+
     return vmaf_picture_unref(ref) | vmaf_picture_unref(dist);
 }
 
@@ -633,13 +670,17 @@ static int threaded_read_pictures_batch(VmafContext *vmaf, VmafPicture *ref,
     int err = 0;
 
-    VmafPicture pic_a, pic_b;
+    VmafPicture pic_a, pic_b, prev_ref = { 0 };
     vmaf_picture_ref(&pic_a, ref);
     vmaf_picture_ref(&pic_b, dist);
 
+    // Batch path takes the prev-ref unconditionally; the per-extractor
+    // PREV_REF flag is checked inside threaded_extract_batch_func.
+    if (vmaf->prev_ref.ref)
+        vmaf_picture_ref(&prev_ref, &vmaf->prev_ref);
+
     struct ThreadDataBatch data = {
         .ref = pic_a,
         .dist = pic_b,
+        .prev_ref = prev_ref,
         .index = index,
         .feature_collector = vmaf->feature_collector,
         .registered_fex = &vmaf->registered_feature_extractors,
@@ -652,9 +693,14 @@ static int threaded_read_pictures_batch(VmafContext *vmaf, VmafPicture *ref,
     if (err) {
         vmaf_picture_unref(&pic_a);
         vmaf_picture_unref(&pic_b);
+        if (prev_ref.ref) vmaf_picture_unref(&prev_ref);
         return err;
     }
 
+    if (vmaf->prev_ref.ref)
+        vmaf_picture_unref(&vmaf->prev_ref);
+    vmaf_picture_ref(&vmaf->prev_ref, ref);
+
     return vmaf_picture_unref(ref) | vmaf_picture_unref(dist);
 }
 #endif // VMAF_BATCH_THREADING
 
@@ -706,6 +752,22 @@ static int flush_context_threaded(VmafContext *vmaf)
     err |= vmaf_fex_ctx_pool_flush(vmaf->fex_ctx_pool, vmaf->feature_collector);
 #endif
 
+    {
+        RegisteredFeatureExtractors rfe = vmaf->registered_feature_extractors;
+        for (unsigned i = 0; i < rfe.cnt; i++) {
+            VmafFeatureExtractor *fex = rfe.fex_ctx[i]->fex;
+            if (fex->flags & VMAF_FEATURE_EXTRACTOR_TEMPORAL)
+                continue;
+            if (fex->flags & VMAF_FEATURE_EXTRACTOR_CUDA)
+                continue;
+            if (!fex->flush)
+                continue;
+            int flush_err = 0;
+            // NOTE(review): spins until flush() returns non-zero; verify the
+            // convention (positive == done, negative == error) matches
+            // vmaf_feature_extractor_context_flush, and that extractors
+            // already flushed by the pool above tolerate a second flush.
+            while (!(flush_err = fex->flush(fex, vmaf->feature_collector)));
+            if (flush_err < 0) err |= flush_err;
+        }
+    }
+
     if (!err) vmaf->flushed = true;
     return err;
 }
 
@@ -918,9 +980,19 @@ int vmaf_read_pictures(VmafContext *vmaf, VmafPicture *ref, VmafPicture *dist,
                 &dist_device : &dist_host;
 #endif
 
+        // Serial path: lend the previous reference to PREV_REF extractors as
+        // a borrowed struct copy (no refcount taken), cleared after the call.
+        if ((fex_ctx->fex->flags & VMAF_FEATURE_EXTRACTOR_PREV_REF) &&
+            vmaf->prev_ref.ref)
+        {
+            fex_ctx->fex->prev_ref = vmaf->prev_ref;
+        }
+
         err = vmaf_feature_extractor_context_extract(fex_ctx, ref, NULL, dist,
                                                      NULL, index,
                                                      vmaf->feature_collector);
+
+        if (fex_ctx->fex->flags & VMAF_FEATURE_EXTRACTOR_PREV_REF)
+            memset(&fex_ctx->fex->prev_ref, 0, sizeof(fex_ctx->fex->prev_ref));
+
         if (err) return err;
     }
 
@@ -938,6 +1010,10 @@ int vmaf_read_pictures(VmafContext *vmaf, VmafPicture *ref, VmafPicture *dist,
         return threaded_read_pictures(vmaf, ref, dist, index);
 #endif
     }
+
+    // NOTE(review): in the CUDA build the extractors were handed
+    // ref_host/ref_device above, yet prev_ref stores the original `ref` --
+    // confirm PREV_REF extractors are never combined with the CUDA path.
+    if (vmaf->prev_ref.ref)
+        vmaf_picture_unref(&vmaf->prev_ref);
+    vmaf_picture_ref(&vmaf->prev_ref, ref);
 
 #ifdef HAVE_CUDA
     if (ref_host.priv) err |= vmaf_picture_unref(&ref_host);
diff --git a/libvmaf/src/meson.build b/libvmaf/src/meson.build
index 2cc4cea3d..423e027de 100644
--- a/libvmaf/src/meson.build
+++ b/libvmaf/src/meson.build
@@ -239,6 +239,7 @@ if is_asm_enabled
     x86_avx2_sources = [
         feature_src_dir + 'common/convolution_avx.c',
         feature_src_dir + 'x86/motion_avx2.c',
+        feature_src_dir + 'x86/motion_v2_avx2.c',
         feature_src_dir + 'x86/vif_avx2.c',
         feature_src_dir + 'x86/adm_avx2.c',
         feature_src_dir + 'x86/cambi_avx2.c',
@@ -256,6 +257,7 @@ if is_asm_enabled
     if is_avx512_enabled and is_avx512_supported
         x86_avx512_sources = [
             feature_src_dir + 'x86/motion_avx512.c',
+            feature_src_dir + 'x86/motion_v2_avx512.c',
             feature_src_dir + 'x86/vif_avx512.c',
             feature_src_dir + 'x86/adm_avx512.c',
         ]
@@ -440,6 +442,7 @@ libvmaf_feature_sources = [
     feature_src_dir + 'integer_adm.c',
     feature_src_dir +
'feature_collector.c',
     feature_src_dir + 'integer_motion.c',
+    # pipelined motion implementation (integer_motion_v2)
+    feature_src_dir + 'integer_motion_v2.c',
     feature_src_dir + 'integer_vif.c',
     feature_src_dir + 'ciede.c',
     feature_src_dir + 'common/alignment.c',